-rw-r--r--.gitlab-ci.yml43
-rw-r--r--README.md55
-rw-r--r--TODO2
-rw-r--r--extra/RUNBOOK.md44
-rw-r--r--extra/blobs/README.md86
-rw-r--r--extra/blobs/minio/README.md74
-rw-r--r--extra/blobs/minio/minio.conf14
-rw-r--r--extra/blobs/seaweedfs/README.md9
-rw-r--r--extra/blobs/tasks.md53
-rw-r--r--extra/docker/README.md11
-rw-r--r--extra/docker/docker-compose.yml39
-rw-r--r--extra/hbase/howto.md (renamed from hbase/howto.md)0
-rw-r--r--extra/hbase/notes.txt (renamed from hbase/notes.txt)0
-rw-r--r--extra/hbase/schema_design.md (renamed from hbase/schema_design.md)0
-rw-r--r--extra/nginx/README.md (renamed from nginx/README.md)0
-rw-r--r--extra/nginx/fatcat-blobs (renamed from nginx/fatcat-blobs)0
-rw-r--r--extra/nginx/sandcrawler-db (renamed from nginx/sandcrawler-db)0
-rw-r--r--extra/nginx/sandcrawler-minio (renamed from nginx/sandcrawler-minio)0
-rw-r--r--kafka/debugging_issues.txt9
-rw-r--r--kafka/howto_rebalance.md43
-rw-r--r--kafka/monitoring_commands.md4
-rw-r--r--kafka/topics.md107
-rw-r--r--minio/README.md31
-rw-r--r--notes/dryad_datasets.md17
-rw-r--r--notes/examples/2021-11-12_broken_grobid_xml.md83
-rw-r--r--notes/examples/dataset_examples.txt52
-rw-r--r--notes/examples/html_test_journals.txt153
-rw-r--r--notes/examples/random_datasets.md19
-rw-r--r--notes/fuzzy_match_notes.md148
-rw-r--r--notes/grobid_munging.txt70
-rw-r--r--notes/hadoop_job_log.md210
-rw-r--r--notes/html_ingest_notes.md318
-rw-r--r--notes/ingest/.gitignore2
-rw-r--r--notes/ingest/2019-10-23_testing.md8
-rw-r--r--notes/ingest/2020-01-14_bulk.md26
-rw-r--r--notes/ingest/2020-02-04_ingest_backfills.md148
-rw-r--r--notes/ingest/2020-02-18_ingest_backfills.md42
-rw-r--r--notes/ingest/2020-02-21_ingest_backfills.md104
-rw-r--r--notes/ingest/2020-02-22_fixed_domain.txt246
-rw-r--r--notes/ingest/2020-02_unpaywall.md624
-rw-r--r--notes/ingest/2020-03-02_ingests.txt174
-rw-r--r--notes/ingest/2020-03-oa_but_not_marked.md25
-rw-r--r--notes/ingest/2020-03_mag.md576
-rw-r--r--notes/ingest/2020-03_s2.md35
-rw-r--r--notes/ingest/2020-04-13_covid19.md73
-rw-r--r--notes/ingest/2020-04_datacite.md121
-rw-r--r--notes/ingest/2020-04_unpaywall.md312
-rw-r--r--notes/ingest/2020-05_oai_pmh.md428
-rw-r--r--notes/ingest/2020-05_pubmed.md10
-rw-r--r--notes/ingest/2020-07_mag.md353
-rw-r--r--notes/ingest/2020-08_daily_improvements.md202
-rw-r--r--notes/ingest/2020-09_oa_doi.md352
-rw-r--r--notes/ingest/2020-09_reingest.md197
-rw-r--r--notes/ingest/2020-09_scielo.md21
-rw-r--r--notes/ingest/2020-10_daily.md193
-rw-r--r--notes/ingest/2020-10_unpaywall.md286
-rw-r--r--notes/ingest/2020-11-04_arxiv.md12
-rw-r--r--notes/ingest/2020-11_doaj.md295
-rw-r--r--notes/ingest/2020-12-08_patch_crawl_notes.md111
-rw-r--r--notes/ingest/2021-04_unpaywall.md368
-rw-r--r--notes/ingest/2021-05_daily_improvements.md480
-rw-r--r--notes/ingest/2021-07_unpaywall.md320
-rw-r--r--notes/ingest/2021-08_mag.md400
-rw-r--r--notes/ingest/2021-09-02_oai_pmh_patch.md1578
-rw-r--r--notes/ingest/2021-09-03_daily_improvements.md1021
-rw-r--r--notes/ingest/2021-09-03_patch_crawl.md678
-rw-r--r--notes/ingest/2021-12-13_datasets.md504
-rw-r--r--notes/ingest/2022-01-06_patch_crawl.md398
-rw-r--r--notes/ingest/2022-01-13_doi_crawl.md248
-rw-r--r--notes/ingest/2022-03_doaj.md278
-rw-r--r--notes/ingest/2022-03_oaipmh.md40
-rw-r--r--notes/ingest/2022-04_targeted.md144
-rw-r--r--notes/ingest/2022-04_unpaywall.md278
-rw-r--r--notes/ingest/2022-07-15_ingest_fixes.md831
-rw-r--r--notes/ingest/2022-07-19_dblp.md50
-rw-r--r--notes/ingest/2022-07_doaj.md199
-rw-r--r--notes/ingest/2022-07_targeted.md140
-rw-r--r--notes/ingest/2022-09_oaipmh.md397
-rw-r--r--notes/ingest/NEXT.md52
-rwxr-xr-xnotes/ingest/es_csv_to_json.py37
-rw-r--r--notes/ingest_domains.txt294
-rw-r--r--notes/job_log.txt103
-rw-r--r--notes/possible_ingest_targets.txt15
-rw-r--r--notes/tasks/2020-01-06_heuristic_cdx.txt37
-rw-r--r--notes/tasks/2020-01-27_cleanup_cdx.md34
-rw-r--r--notes/tasks/2020-01-27_grobid_backfill.md40
-rw-r--r--notes/tasks/2020-02-14_pdftrio.md162
-rw-r--r--notes/tasks/2020-07-22_processing_holes.md120
-rw-r--r--notes/tasks/2020-08-20_file_meta.md66
-rw-r--r--notes/tasks/2020-10-21_pdfextract_holes.md74
-rw-r--r--notes/tasks/2021-09-09_pdf_url_lists.md70
-rw-r--r--notes/tasks/2021-10-29_crossref_refs_backfill.md235
-rw-r--r--notes/tasks/2021-12-06_regrobid.md380
-rw-r--r--notes/tasks/2022-01-07_grobid_platform_pdfs.md23
-rw-r--r--notes/tasks/2022-03-07_ukraine_firedrill.md225
-rw-r--r--notes/tasks/2022-04-27_pdf_url_lists.md72
-rw-r--r--notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md132
-rw-r--r--pig/filter-cdx-pdfs.pig24
-rw-r--r--pig/filter-cdx-ps.pig6
-rw-r--r--pig/filter-cdx-source-code-crude.pig40
-rw-r--r--pig/filter-cdx-tarball.pig38
-rw-r--r--pig/join-cdx-sha1.pig43
-rw-r--r--pig/tests/files/example.sha1b324
-rw-r--r--pig/tests/files/sourcecode.cdx6
-rw-r--r--pig/tests/files/tarballs.cdx10
-rw-r--r--pig/tests/pighelper.py5
-rw-r--r--pig/tests/test_filter_cdx_paper_pdfs.py4
-rw-r--r--pig/tests/test_filter_software.py16
-rw-r--r--pig/tests/test_join_cdx.py44
-rwxr-xr-xplease26
-rw-r--r--proposals/2018_original_sandcrawler_rfc.md (renamed from sandcrawler-rfc.md)2
-rw-r--r--proposals/2019_ingest.md287
-rw-r--r--proposals/2019_pdftotext_pdfinfo.md123
-rw-r--r--proposals/20200129_pdf_ingest.md272
-rw-r--r--proposals/20200207_pdftrio.md107
-rw-r--r--proposals/20200211_nsq.md79
-rw-r--r--proposals/20201012_no_capture.md39
-rw-r--r--proposals/20201026_html_ingest.md129
-rw-r--r--proposals/20201103_xml_ingest.md64
-rw-r--r--proposals/2020_pdf_meta_thumbnails.md328
-rw-r--r--proposals/2020_seaweed_s3.md426
-rw-r--r--proposals/2021-04-22_crossref_db.md86
-rw-r--r--proposals/2021-09-09_component_ingest.md114
-rw-r--r--proposals/2021-09-09_fileset_ingest.md343
-rw-r--r--proposals/2021-09-13_src_ingest.md53
-rw-r--r--proposals/2021-09-21_spn_accounts.md14
-rw-r--r--proposals/2021-10-28_grobid_refs.md125
-rw-r--r--proposals/2021-12-09_trawling.md180
-rw-r--r--proposals/brainstorm/2021-debug_web_interface.md9
-rw-r--r--proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md36
-rw-r--r--proposals/schema_changes.sql40
-rw-r--r--python/.coveragerc3
-rw-r--r--python/.flake821
-rw-r--r--python/.gitignore13
-rw-r--r--python/.pylintrc2
-rw-r--r--python/Makefile32
-rw-r--r--python/Pipfile42
-rw-r--r--python/Pipfile.lock2139
-rw-r--r--python/README.md46
-rw-r--r--python/TODO1
-rw-r--r--python/common.py99
-rw-r--r--python/example.env8
-rwxr-xr-xpython/grobid2json.py181
-rwxr-xr-xpython/grobid_tool.py174
-rwxr-xr-xpython/ia_pdf_match.py97
-rwxr-xr-xpython/ingest_file.py386
-rwxr-xr-xpython/ingest_tool.py244
-rwxr-xr-xpython/kafka_grobid.py327
-rwxr-xr-xpython/pdfextract_tool.py151
-rwxr-xr-xpython/pdftrio_tool.py139
-rwxr-xr-xpython/persist_tool.py311
-rw-r--r--python/pyproject.toml7
-rw-r--r--python/pytest.ini16
-rw-r--r--python/sandcrawler/__init__.py55
-rw-r--r--python/sandcrawler/db.py650
-rw-r--r--python/sandcrawler/fileset_platforms.py832
-rw-r--r--python/sandcrawler/fileset_strategies.py387
-rw-r--r--python/sandcrawler/fileset_types.py74
-rw-r--r--python/sandcrawler/grobid.py410
-rw-r--r--python/sandcrawler/html.py365
-rw-r--r--python/sandcrawler/html_metadata.py1077
-rw-r--r--python/sandcrawler/ia.py1457
-rw-r--r--python/sandcrawler/ingest_file.py925
-rw-r--r--python/sandcrawler/ingest_fileset.py516
-rw-r--r--python/sandcrawler/ingest_html.py499
-rw-r--r--python/sandcrawler/minio.py118
-rw-r--r--python/sandcrawler/misc.py248
-rw-r--r--python/sandcrawler/pdfextract.py502
-rw-r--r--python/sandcrawler/pdftrio.py142
-rw-r--r--python/sandcrawler/persist.py785
-rw-r--r--python/sandcrawler/workers.py578
-rw-r--r--python/sandcrawler/xml.py6
-rwxr-xr-xpython/sandcrawler_worker.py495
-rwxr-xr-xpython/scripts/arabesque2ingestrequest.py74
-rwxr-xr-xpython/scripts/archiveorg_fileset.py135
-rwxr-xr-xpython/scripts/cdx_collection.py82
-rwxr-xr-xpython/scripts/covid2ingestrequest.py90
-rwxr-xr-xpython/scripts/deliver_dumpgrobid_to_s3.py94
-rwxr-xr-xpython/scripts/deliver_gwb_to_disk.py193
-rwxr-xr-xpython/scripts/deliver_gwb_to_s3.py182
-rwxr-xr-xpython/scripts/doaj2ingestrequest.py144
-rwxr-xr-xpython/scripts/enrich_scored_matches.py19
-rwxr-xr-xpython/scripts/fetch_cdx_sha1hex.py170
-rwxr-xr-xpython/scripts/filter_grobid_metadata.py127
-rwxr-xr-xpython/scripts/filter_groupworks.py48
-rwxr-xr-xpython/scripts/filter_scored_matches.py49
-rwxr-xr-xpython/scripts/grobid_affiliations.py37
-rwxr-xr-xpython/scripts/import_grobid_metadata.py69
-rwxr-xr-xpython/scripts/ingestrequest_row2json.py59
-rwxr-xr-xpython/scripts/manifest_converter.py7
-rwxr-xr-xpython/scripts/oai2ingestrequest.py177
-rwxr-xr-xpython/scripts/pdf_thumbnail.py38
-rwxr-xr-xpython/scripts/unpaywall2ingestrequest.py111
-rw-r--r--python/tests/files/crossref_api_work_978-3-030-64953-1_4.json1
-rw-r--r--python/tests/files/crossref_api_work_s1047951103000064.json1
-rw-r--r--python/tests/files/dlib_05vanhyning.html350
-rw-r--r--python/tests/files/elife_article.html3094
-rw-r--r--python/tests/files/first_monday_ojs3_fulltext.html441
-rw-r--r--python/tests/files/first_monday_ojs3_landingpage.html616
-rw-r--r--python/tests/files/genders_g58_fairlie.html146
-rw-r--r--python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml66
-rw-r--r--python/tests/files/grobid_refs_s1047951103000064.tei.xml499
-rw-r--r--python/tests/files/nature_article.html1379
-rw-r--r--python/tests/files/peerj_oa_article.html2365
-rw-r--r--python/tests/files/plos_one_article.html1707
-rw-r--r--python/tests/files/scielo_article.jats.xml336
-rw-r--r--python/tests/files/small.json18
-rw-r--r--python/tests/test_common.py40
-rw-r--r--python/tests/test_grobid.py227
-rw-r--r--python/tests/test_grobid2json.py26
-rw-r--r--python/tests/test_html.py7
-rw-r--r--python/tests/test_html_ingest.py10
-rw-r--r--python/tests/test_html_metadata.py261
-rw-r--r--python/tests/test_ingest.py264
-rw-r--r--python/tests/test_live_wayback.py181
-rw-r--r--python/tests/test_misc.py99
-rw-r--r--python/tests/test_pdfextract.py71
-rw-r--r--python/tests/test_pushers.py33
-rw-r--r--python/tests/test_savepagenow.py331
-rw-r--r--python/tests/test_wayback.py297
-rw-r--r--python/tests/test_xml.py17
l---------python/title_slug_denylist.txt (renamed from python/title_slug_blacklist.txt)0
-rw-r--r--python_hadoop/README.md8
-rw-r--r--scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala187
-rw-r--r--scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala175
-rw-r--r--sql/Makefile35
-rw-r--r--sql/README.md42
-rw-r--r--sql/backfill/backfill.md13
-rwxr-xr-xsql/backfill/backfill_cdx.py1
-rw-r--r--sql/dump_file_meta.sql12
-rw-r--r--sql/dump_regrobid_pdf.sql15
-rw-r--r--sql/dump_regrobid_pdf_petabox.sql15
-rw-r--r--sql/dump_reingest_bulk.sql31
-rw-r--r--sql/dump_reingest_old.sql36
-rw-r--r--sql/dump_reingest_quarterly.sql47
-rw-r--r--sql/dump_reingest_spn.sql36
-rw-r--r--sql/dump_reingest_terminalstatus.sql34
-rw-r--r--sql/dump_reingest_weekly.sql42
-rw-r--r--sql/dump_unextracted_pdf.sql22
-rw-r--r--sql/dump_unextracted_pdf_petabox.sql18
-rw-r--r--sql/dump_ungrobid_pdf.sql18
-rw-r--r--sql/dump_ungrobid_pdf_petabox.sql17
-rw-r--r--sql/dump_unmatched_glutton_pdf.sql19
-rw-r--r--sql/example.env1
-rw-r--r--sql/ingest_again.md158
-rw-r--r--sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt326
-rw-r--r--sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt307
-rw-r--r--sql/migrations/00000000000000_diesel_initial_setup/down.sql6
-rw-r--r--sql/migrations/00000000000000_diesel_initial_setup/up.sql36
-rw-r--r--sql/migrations/2019-12-19-060141_init/down.sql8
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql245
-rw-r--r--sql/monitoring_queries.md202
-rw-r--r--sql/pdftrio_queries.md65
-rw-r--r--sql/random_queries.md193
-rwxr-xr-xsql/reingest_bulk.sh19
-rwxr-xr-xsql/reingest_old.sh19
-rwxr-xr-xsql/reingest_quarterly.sh19
-rwxr-xr-xsql/reingest_spn.sh19
-rwxr-xr-xsql/reingest_terminalstatus_forcerecrawl.sh19
-rwxr-xr-xsql/reingest_weekly.sh19
l---------[-rw-r--r--]sql/sandcrawler_schema.sql60
-rw-r--r--sql/stats/2020-01-13_stats.txt190
-rw-r--r--sql/stats/2020-01-31_supplement.txt42
-rw-r--r--sql/stats/2020-02-24_stats.txt482
-rw-r--r--sql/stats/2020-05-03_stats.txt418
-rw-r--r--sql/stats/2020-07-23_stats.txt347
-rw-r--r--sql/stats/2020-09-14_stats.txt340
-rw-r--r--sql/stats/2021-04-07_stats.txt430
-rw-r--r--sql/stats/2021-04-08_table_sizes.txt40
-rw-r--r--sql/stats/2021-04-12_ingest_domain_summary_30d.txt345
-rw-r--r--sql/stats/2021-11-01_table_sizes.txt19
-rw-r--r--sql/stats/2021-11-26_stats.txt424
-rw-r--r--sql/stats/2021-12-02_table_sizes.txt22
-rw-r--r--sql/stats/2022-04-26_stats.txt432
-rw-r--r--sql/stats/2022-04-27_crawl_changelog.txt191
-rw-r--r--sql/stats/2022-05-11_crawl_changelog.txt410
-rw-r--r--sql/stats/2022-09-06_stats.txt438
-rw-r--r--sql/stats/2022-11-23_table_sizes.txt21
-rw-r--r--sql/stats/README.md109
-rw-r--r--sql/table_sizes.md11
280 files changed, 54153 insertions, 2765 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7792992..457a250 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,31 +1,39 @@
-image: ubuntu:xenial
-before_script:
- - apt update -qy
- - apt install -y apt-transport-https
- - echo "deb https://dl.bintray.com/sbt/debian /" | tee -a /etc/apt/sources.list.d/sbt.list
- - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
- - apt update -qy
- - apt install -y python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt
- - pip3 install pipenv
- - pipenv --version
+
+image: ubuntu:focal
variables:
LC_ALL: "C.UTF-8"
LANG: "C.UTF-8"
+ DEBIAN_FRONTEND: "noninteractive"
+
+before_script:
+ - apt update -qy
+ - apt install -y --no-install-recommends apt-transport-https software-properties-common curl dirmngr gpg-agent
+ # scala-sbt.org APT signing key
+ - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0x2EE0EA64E40A89B84B2DF73499E82A75642AC823
+ - apt-add-repository -y "deb https://repo.scala-sbt.org/scalasbt/debian all main"
+ - apt install -y --no-install-recommends python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt libpq-dev python-dev python3.8 python3.8-dev python3.8-venv python3.8-distutils pkg-config python3-pytest git libsnappy-dev libsodium-dev libpoppler-cpp-dev cmake libpython3.8-dev build-essential poppler-data libmagic1 pipenv wget
+ - pipenv --version
test_python:
script:
- cd python
+ - cp example.env .env
- pipenv install --dev --deploy
- - pipenv run pytest --cov
+ - make coverage
+ - make lint
test_python_hadoop:
+ when: manual
script:
- cd python_hadoop
- pipenv install --dev --deploy
- pipenv run pytest --cov
+# needs fixing; some upstream change happened: com.hadoop.gplcompression#hadoop-lzo;0.4.16: java.lang.NullPointerException
test_scalding:
+ when: manual
script:
- ./please -h
- cd scalding
@@ -33,9 +41,10 @@ test_scalding:
- sbt -mem 1024 assembly
# Needs fixing
-#test_pig:
-# script:
-# - ./fetch_hadoop.sh
-# - cd pig
-# - pipenv install --dev --deploy
-# - JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::") pipenv run pytest
+test_pig:
+ when: manual
+ script:
+ - ./fetch_hadoop.sh
+ - cd pig
+ - pipenv install --dev --deploy
+ - JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::") pipenv run pytest
diff --git a/README.md b/README.md
index 386149d..b29e397 100644
--- a/README.md
+++ b/README.md
@@ -6,35 +6,62 @@
\ooooooo| |___/\__,_|_| |_|\__,_|\___|_| \__,_| \_/\_/ |_|\___|_|
-This repo contains hadoop jobs, luigi tasks, and other scripts and code for the
-internet archive web group's journal ingest pipeline.
+This repo contains back-end python workers, scripts, config files, and other
+stuff related to the Internet Archive web group's scholarly web preservation
+and processing pipeline. It is a complement to [fatcat](https://fatcat.wiki),
+which is an open catalog of research outputs, including preservation metadata.
-Code in tihs repository is potentially public!
+The sandcrawler part of the project deals with content crawled from the web
+into either web.archive.org or archive.org collections, and post-processing
+that content. For example, extracting text from PDF files, verifying mimetypes,
+and checking archival status. The resulting metadata ends up getting filtered,
+transformed, and pushed in to fatcat itself for public use.
+
+While code in this repository is public, it is mostly IA-specific and may not
+even run outside the IA data centers due to library dependencies and
+authentication needs. Code quality and documentation are generally poor compared
+to fatcat.
+
+As of December 2022, the best document to read for "getting started" in
+understanding the ingest system is `proposals/2019_ingest.md`, and then
+subsequent proposals expanding on that foundation.
Archive-specific deployment/production guides and ansible scripts at:
[journal-infra](https://git.archive.org/webgroup/journal-infra)
-**./python/** contains scripts and utilities for
+
+## Repository Layout
+
+**./python/** contains scripts and utilities for ingesting content from wayback
+and/or the web (via save-page-now API), and other processing pipelines. Most of
+the active code is in here. See the included README (`./python/README.md`).
**./sql/** contains schema, queries, and backfill scripts for a Postgres SQL
database index (eg, file metadata, CDX, and GROBID status tables).
-**./minio/** contains docs on how to setup and use a minio S3-compatible blob
-store (eg, for GROBID XML output)
+**./python_hadoop/** contains Hadoop streaming jobs written in python using the
+`mrjob` library. The HBase backfill code path is still used occasionally.
-**./scalding/** contains Hadoop jobs written in Scala using the Scalding
-framework. The intent is to write new non-trivial Hadoop jobs in Scala, which
-brings type safety and compiled performance.
+**./proposals/** design documentation and change proposals
-**./python_hadoop/** contains Hadoop streaming jobs written in python using the
-`mrjob` library. Considered deprecated!
+**./notes/ingest/** log of bulk crawls and metadata loads
+
+**./extra/docker/** docker-compose setup that may be useful for documentation
+(includes Kafka, PostgreSQL, etc)
+
+**./.gitlab-ci.yml** current CI setup script, which documents dependencies
**./pig/** contains a handful of Pig scripts, as well as some unittests
-implemented in python.
+implemented in python. Only rarely used.
-## Running Hadoop Jobs
+**./scalding/** contains Hadoop jobs written in Scala using the Scalding
+framework. The intent was to write new non-trivial Hadoop jobs in Scala, which
+brings type safety and compiled performance. Mostly DEPRECATED; this code has
+not been run in years.
+
+
+## Running Python Hadoop Jobs
The `./please` python3 wrapper script is a helper for running jobs (python or
scalding) on the IA Hadoop cluster. You'll need to run the setup/dependency
tasks first; see README files in subdirectories.
-
diff --git a/TODO b/TODO
index 77b48c9..33dc147 100644
--- a/TODO
+++ b/TODO
@@ -1,4 +1,6 @@
+Note: as of 2022 this file is ancient and needs review
+
## Kafka Pipelines
- after network split, mass restarting import/harvest stuff seemed to
diff --git a/extra/RUNBOOK.md b/extra/RUNBOOK.md
new file mode 100644
index 0000000..6c4165d
--- /dev/null
+++ b/extra/RUNBOOK.md
@@ -0,0 +1,44 @@
+
+## Process Un-GROBID-ed PDFs from Wayback
+
+Sometimes ingest doesn't pick up everything, or we do some heuristic CDX
+import, and we want to run GROBID over all the PDFs that haven't been processed
+yet. Only want one CDX line per `sha1hex`.
+
+A hybrid SQL/UNIX way of generating processing list:
+
+ psql sandcrawler < /fast/sandcrawler/sql/dump_ungrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_ungrobid_pdf.2020.01-27.json
+
+From here, there are two options: enqueue in Kafka and let workers run, or
+create job files and run them using local worker and GNU/parallel.
+
+#### Kafka
+
+Copy/transfer to a Kafka node; load a sample and then the whole output:
+
+ head -n1000 dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1
+ cat dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1
+
+#### Local JSON
+
+Older example; if this fails, need to re-run entire thing:
+
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+TODO: is it possible to use a job log with millions of `--pipe` inputs? That
+would be more efficient in the event of failure.
+
+## GROBID over many .zip files
+
+Want to use GNU/Parallel in a mode that will do retries well:
+
+ fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
+ sort | \
+ parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
+ './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+
+After starting, check that messages are actually getting pushed to kafka
+(producer failures can be silent!). If anything goes wrong, run the exact same
+command again. The sort is to ensure jobs are enqueued in the same order again;
+could also dump `fd` output to a command file first.
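+
+One rough way to check that messages are actually landing in the topic is to
+watch the high-water offsets advance while the job runs. A minimal sketch using
+`confluent_kafka` (broker hostname is just an example; this is not part of the
+regular tooling):
+
+    from confluent_kafka import Consumer, TopicPartition
+
+    topic = "sandcrawler-prod.grobid-output-pg"
+    c = Consumer({
+        "bootstrap.servers": "wbgrp-svc350.us.archive.org:9092",
+        "group.id": "tmp-offset-check",  # throw-away group; offsets never committed
+    })
+    meta = c.list_topics(topic, timeout=10)
+    for p in sorted(meta.topics[topic].partitions):
+        low, high = c.get_watermark_offsets(TopicPartition(topic, p), timeout=10)
+        print(f"partition {p}: high-water offset {high}")
+    c.close()
+
+Run it twice a few minutes apart; if offsets aren't moving, the producer side is
+probably failing silently.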
+
diff --git a/extra/blobs/README.md b/extra/blobs/README.md
new file mode 100644
index 0000000..555db92
--- /dev/null
+++ b/extra/blobs/README.md
@@ -0,0 +1,86 @@
+
+This document describes sandcrawler/fatcat use of "blob store" infrastructure
+for storing hundreds of millions of small files: for example, GROBID XML
+documents and JPEG thumbnails of PDFs.
+
+The basic feature requirements for this system are:
+
+- don't need preservation data resiliency: all this data is derived from
+ primary content, and is usually redundantly stored in Kafka topics (and thus
+ can be re-indexed to any server bounded only by throughput of the object
+ store service; Kafka is usually faster)
+- don't require SSDs or large amounts of RAM. Ability to accelerate performance
+ with additional RAM or moving indexes to SSD is nice, but we will be using
+ spinning disks for primary data storage
+- hundreds of millions or billions of objects, fetchable by a key we define
+- optional transparent compression (for text and XML)
+- typical object (file) size of 5-200 KBytes uncompressed, want to support up
+ to several MBytes
+- very simple internal API for GET/PUT (S3 API compatible is good)
+- ability to proxy to HTTP publicly for reads (eg, HTTP fall-back with no
+ authentication), controllable at least at bucket granularity
+
+## Infrastructure
+
+`minio` was used initially, but did not scale well to large numbers of files. We
+currently use seaweedfs. Any S3-compatible key/value store should work in
+theory. openlibrary.org has used WARCs in petabox items in the past. Actual
+cloud object stores tend to be expensive for this kind of use case.
+
+The facebook "haystack" project (and whitepaper) are good background reading
+describing one type of system that works well for this application.
+
+
+## Bucket / Folder Structure
+
+Currently we run everything off a single server, with no redundancy. There is
+no QA/prod distinction.
+
+Setting access control and doing bulk deletions is easiest at the bucket level,
+less easy at the folder level, and most difficult at the suffix (file
+extension) level.
+
+For files that are derived from PDFs, we use the SHA-1 (in lower-case hex) of
+the source PDF to construct keys. We generate nested "directories" from the hash
+to limit the number of keys per "directory" (even though in S3/seaweedfs there
+are no actual directories involved). The structure looks like:
+
+ <bucket>/<folder>/<byte0>/<byte1>/<sha1hex><suffix>
+
+Eg:
+
+ sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml
+
+The nesting is sort of a hold-over from minio (where files were actually
+on-disk), but seems worth keeping in case we end up switching storage systems
+in the future.
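+
+As a minimal sketch (the helper name is illustrative, not actual sandcrawler
+code), key construction from a PDF `sha1hex` looks roughly like:
+
+    def blob_path(folder: str, sha1hex: str, suffix: str) -> str:
+        # nested "directories" come from the first two bytes of the lower-case
+        # hex SHA-1 of the source PDF
+        sha1hex = sha1hex.lower()
+        return f"{folder}/{sha1hex[0:2]}/{sha1hex[2:4]}/{sha1hex}{suffix}"
+
+    blob_path("grobid", "1a6462a925a9767b797fe6085093b6aa9f27f523", ".tei.xml")
+    # => "grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml"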
+
+## Existing Content
+
+sandcrawler: internal/controlled access to PDF derivatives
+ grobid: TEI-XML documents
+ extension: .tei.xml
+ text: raw pdftotext (or other text transform)
+ extension: .txt
+
+thumbnail: public bucket for thumbnail images
+ pdf: thumbnails from PDF files
+ extension: .180px.jpg
+
+## Proxy and URLs
+
+Internal HTTP access via:
+
+ http://wbgrp-svc169.us.archive.org:8333/<bucket>/<key>
+
+Public access via:
+
+ https://blobs.fatcat.wiki/<bucket>/<key>
+
+Eg:
+
+ http://wbgrp-svc169.us.archive.org:8333/testing/small.txt
+ http://wbgrp-svc169.us.archive.org:8333/sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml
+ https://blobs.fatcat.wiki/testing/small.txt
+ https://blobs.fatcat.wiki/thumbnail/pdf/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.180px.jpg
+
diff --git a/extra/blobs/minio/README.md b/extra/blobs/minio/README.md
new file mode 100644
index 0000000..d8f1c69
--- /dev/null
+++ b/extra/blobs/minio/README.md
@@ -0,0 +1,74 @@
+
+minio is used as an S3-compatible blob store. Initial use case is GROBID XML
+documents, addressed by the sha1 of the PDF file the XML was extracted from.
+
+Note that on the backend minio is just storing objects as files on disk.
+
+## Deploying minio Server
+
+It seems to be important to use a version of minio from at least the December
+2019 era for on-disk compression to actually work.
+
+We currently install minio (and mc, the minio client) in prod by simply
+downloading the binaries and invoking them from systemd.
+
+## Buckets and Directories
+
+Hosts and buckets:
+
+ localhost:sandcrawler-dev
+ create locally for development (see below)
+
+ cluster:sandcrawler
+ main sandcrawler storage bucket, for GROBID output and other derivatives.
+ Note it isn't "sandcrawler-prod", for backwards compatibility reasons.
+
+ cluster:sandcrawler-qa
+ for, eg, testing on cluster servers
+
+ cluster:unpaywall
+ subset of sandcrawler content crawled due to unpaywall URLs;
+ potentially made publicly accessible
+
+Directory structure within sandcrawler buckets:
+
+ grobid/2c/0d/2c0daa9307887a27054d4d1f137514b0fa6c6b2d.tei.xml
+ SHA1 (lower-case hex) of PDF that XML was extracted from
+
+Create new buckets like:
+
+ mc mb cluster/sandcrawler-qa
+
+## Development
+
+Run minio server locally, with non-persisted data:
+
+ docker run -p 9000:9000 minio/minio server /data
+
+Credentials are `minioadmin:minioadmin`. Install `mc` client utility, and
+configure:
+
+ mc config host add localhost http://localhost:9000 minioadmin minioadmin
+
+Then create dev bucket:
+
+ mc mb --ignore-existing localhost/sandcrawler-dev
+
+A common "gotcha" with the `mc` command is that it will first look for a local
+folder/directory with the same name as the configured remote host, so make sure
+there isn't a `./localhost` folder.
+
+
+## Users
+
+Create a new readonly user like:
+
+ mc admin user add sandcrawler unpaywall $RANDOM_SECRET_KEY readonly
+
+Make a prefix within a bucket world-readable like:
+
+ mc policy set download cluster/unpaywall/grobid
+
+## Config
+
+ mc admin config set aitio compression extensions=.txt,.log,.csv,.json,.tsv,.pdf,.xml mime_types=text/csv,text/plain,application/json,application/xml,application/octet-stream,application/tei+xml
diff --git a/extra/blobs/minio/minio.conf b/extra/blobs/minio/minio.conf
new file mode 100644
index 0000000..2e93f9a
--- /dev/null
+++ b/extra/blobs/minio/minio.conf
@@ -0,0 +1,14 @@
+
+# Volume to be used for MinIO server.
+MINIO_VOLUMES="/sandcrawler-minio/data"
+# Use if you want to run MinIO on a custom port.
+MINIO_OPTS="--address :9000"
+# Access Key of the server.
+MINIO_ACCESS_KEY=REDACTED
+# Secret key of the server.
+MINIO_SECRET_KEY=REDACTED
+
+# may need to set these manually using `mc admin config get`, edit the JSON, then `set`
+MINIO_COMPRESS="on"
+MINIO_COMPRESS_EXTENSIONS=".txt,.log,.csv,.json,.tar,.xml,.bin,.pdf,.tsv"
+MINIO_COMPRESS_MIME_TYPES="text/*,application/json,application/xml,application/pdf,application/octet-stream"
diff --git a/extra/blobs/seaweedfs/README.md b/extra/blobs/seaweedfs/README.md
new file mode 100644
index 0000000..d19e9e0
--- /dev/null
+++ b/extra/blobs/seaweedfs/README.md
@@ -0,0 +1,9 @@
+
+## HOWTO: Create new bucket in SeaweedFS
+
+Log in to the seaweedfs VM.
+
+Run `weed shell` to start a shell, then:
+
+ bucket.create -name <bucket>
+
diff --git a/extra/blobs/tasks.md b/extra/blobs/tasks.md
new file mode 100644
index 0000000..beb765f
--- /dev/null
+++ b/extra/blobs/tasks.md
@@ -0,0 +1,53 @@
+
+## Backfill GROBID XML to Blob Store
+
+Initially ran this when spinning up a new seaweedfs server to replace minio. At
+that time the grobid persist worker was in db-only mode, as minio was too slow
+to accept uploads. The rough plan is to:
+
+1. run grobid persist worker from Kafka with a new temporary consumer group,
+ from the start of the GROBID output topic
+2. when it gets to the end, stop the *regular* consumer group while this one is
+ still running; with the temporary worker still running, at that point the
+ entire topic should be in S3
+3. then reconfigure regular worker to db+s3 mode. halt the temporary worker,
+ restart the regular one with new config, run it indefinitely
+
+The consumer group isn't a CLI arg, so just edit `persist_worker.py` and set it to
+`persist-grobid-seaweedfs`. Also needed to patch a bit so `--s3-only` mode
+didn't try to connect to postgresql.
+
+Commands:
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+ => Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed
+ => run briefly, then kill
+
+On kafka-broker worker:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --reset-offsets --to-earliest --group persist-grobid-seaweed --topic sandcrawler-prod.grobid-output-pg --dry-run
+
+Then run 2x instances of worker (same command as above):
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only
+
+At this point we are CPU-limited on this worker by the python processes (only 4 cores
+on this machine).
+
+Check in weed shell:
+
+ weed shell
+
+ > > fs.meta.cat buckets/sandcrawler/grobid/00/00/000068a76ab125389506e8834483c6ba4c73338a.tei.xml
+ [...]
+ "isGzipped": false
+ [...]
+ "mime": "application/xml",
+ [...]
+
+An open question is whether we should have separate buckets per derivative type
+(eg, a GROBID XML bucket separate from the thumbnails bucket), or whether prefix
+directories are enough. Basically this comes down to whether we want things
+mixed together at the volume level. I think we should keep them separate.
+
+Need to set the mimetype in the upload for gzip on XML?
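+
+A sketch of setting the mimetype explicitly at upload time with the python
+`minio` client (endpoint and credentials are placeholders, and whether seaweedfs
+keys gzip compression off this value would need testing):
+
+    import io
+    from minio import Minio
+
+    client = Minio(
+        "wbgrp-svc169.us.archive.org:8333",
+        access_key="REDACTED",
+        secret_key="REDACTED",
+        secure=False,
+    )
+    blob = tei_xml.encode("utf-8")  # tei_xml: GROBID output as a string (assumed variable)
+    client.put_object(
+        "sandcrawler",
+        "grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml",
+        io.BytesIO(blob),
+        len(blob),
+        content_type="application/xml",
+    )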
diff --git a/extra/docker/README.md b/extra/docker/README.md
new file mode 100644
index 0000000..23cb5b2
--- /dev/null
+++ b/extra/docker/README.md
@@ -0,0 +1,11 @@
+
+The docker-compose script in this directory may be helpful for local
+development. It starts several dependent services, such as Kafka, minio, etc.
+
+PostgreSQL is assumed to be running natively on localhost, not under docker. It
+should be possible to add postgresql to the docker-compose file, but some
+developers (bnewbold) prefer to run it separately to make things like attaching
+with `psql` easier.
+
+There is no current motivation or plan to deploy sandcrawler services using
+docker, so there is no Dockerfile for the system itself.
diff --git a/extra/docker/docker-compose.yml b/extra/docker/docker-compose.yml
new file mode 100644
index 0000000..196879f
--- /dev/null
+++ b/extra/docker/docker-compose.yml
@@ -0,0 +1,39 @@
+version: '2'
+services:
+ zookeeper:
+ image: wurstmeister/zookeeper
+ ports:
+ - "2181:2181"
+ kafka:
+ image: wurstmeister/kafka:2.11-2.0.0
+ ports:
+ - "9092:9092"
+ environment:
+ #HOSTNAME_COMMAND: "docker info | grep ^Name: | cut -d' ' -f 2"
+ KAFKA_BROKER_ID: 1
+ KAFKA_ADVERTISED_HOST_NAME: 127.0.0.1
+ KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
+ KAFKA_CREATE_TOPICS: "fatcat-dev.changelog:1:1,fatcat-dev.release-updates:3:1:compact"
+ KAFKA_MESSAGE_MAX_BYTES: 50000000
+ volumes:
+ - /var/run/docker.sock:/var/run/docker.sock
+ depends_on:
+ - zookeeper
+ postgrest:
+ image: postgrest/postgrest
+ network_mode: "host"
+ ports:
+ - "3000:3000"
+ environment:
+ PGRST_DB_URI: "postgres://fatcat:tactaf@localhost/sandcrawler"
+ PGRST_DB_ANON_ROLE: "fatcat"
+ minio:
+ image: minio/minio
+ ports:
+ - "9000:9000"
+ environment:
+ MINIO_ACCESS_KEY: minioadmin
+ MINIO_SECRET_KEY: minioadmin
+ entrypoint: sh
+ command: -c "mkdir -p /tmp/minio/sandcrawler && mkdir -p /tmp/minio/thumbnail && mkdir -p /tmp/minio/sandcrawler-dev && /usr/bin/minio server /tmp/minio"
+
diff --git a/hbase/howto.md b/extra/hbase/howto.md
index 26d33f4..26d33f4 100644
--- a/hbase/howto.md
+++ b/extra/hbase/howto.md
diff --git a/hbase/notes.txt b/extra/hbase/notes.txt
index 20f406f..20f406f 100644
--- a/hbase/notes.txt
+++ b/extra/hbase/notes.txt
diff --git a/hbase/schema_design.md b/extra/hbase/schema_design.md
index 2db8998..2db8998 100644
--- a/hbase/schema_design.md
+++ b/extra/hbase/schema_design.md
diff --git a/nginx/README.md b/extra/nginx/README.md
index 0369f9b..0369f9b 100644
--- a/nginx/README.md
+++ b/extra/nginx/README.md
diff --git a/nginx/fatcat-blobs b/extra/nginx/fatcat-blobs
index 5c692ef..5c692ef 100644
--- a/nginx/fatcat-blobs
+++ b/extra/nginx/fatcat-blobs
diff --git a/nginx/sandcrawler-db b/extra/nginx/sandcrawler-db
index 67d1a2d..67d1a2d 100644
--- a/nginx/sandcrawler-db
+++ b/extra/nginx/sandcrawler-db
diff --git a/nginx/sandcrawler-minio b/extra/nginx/sandcrawler-minio
index 2e9bfe3..2e9bfe3 100644
--- a/nginx/sandcrawler-minio
+++ b/extra/nginx/sandcrawler-minio
diff --git a/kafka/debugging_issues.txt b/kafka/debugging_issues.txt
index 1af490e..007c786 100644
--- a/kafka/debugging_issues.txt
+++ b/kafka/debugging_issues.txt
@@ -1,4 +1,13 @@
+## 2020-11-12
+
+To reset a consumer group to the offsets from a specific date (or datetime),
+use:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-grobid-s3 --reset-offsets --all-topics --to-datetime 2020-11-09T00:00:00.000
+
+Add `--execute` to actually commit the change.
+
## 2018-12-02
Had been having some troubles with consumer group partition assignments with
diff --git a/kafka/howto_rebalance.md b/kafka/howto_rebalance.md
new file mode 100644
index 0000000..093740a
--- /dev/null
+++ b/kafka/howto_rebalance.md
@@ -0,0 +1,43 @@
+
+## Rebalance Storage Between Brokers (kafka-manager web)
+
+For each topic you want to rebalance (eg, the large or high-throughput ones),
+go to the topic page and click the blue "reassign partitions" button (or
+potentially "generate" or "manual").
+
+Monitor progress with the "Reassign Partitions" link at top of the page.
+
+Finally, run a preferred replica election after partition movement is complete.
+
+## Rebalance Storage Between Brokers (CLI)
+
+For example, after adding or removing brokers from the cluster.
+
+Create a list of topics to move, and put it in `/tmp/topics_to_move.json`:
+
+ {
+ "version": 1,
+ "topics": [
+ {"topic": "sandcrawler-shadow.grobid-output"},
+ {"topic": "fatcat-prod.api-crossref"}
+ ]
+ }
+
+On a kafka broker, go to `/srv/kafka-broker/kafka-*/bin`, generate a plan, then
+inspect the output:
+
+ ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --broker-list "280,281,284,285,263" --topics-to-move-json-file /tmp/topics_to_move.json --generate > /tmp/reassignment-plan.json
+ cat /tmp/reassignment-plan.json | rg '^\{' | head -n1 | jq . > /tmp/old-plan.json
+ cat /tmp/reassignment-plan.json | rg '^\{' | tail -n1 | jq . > /tmp/new-plan.json
+ cat /tmp/reassignment-plan.json | rg '^\{' | jq .
+
+If that looks good, start the rebalance:
+
+ ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --execute
+
+Then monitor progress:
+
+ ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --verify
+
+Finally, run a preferred replica election after partition movement is complete.
+Currently do this through the web interface (linked above).
diff --git a/kafka/monitoring_commands.md b/kafka/monitoring_commands.md
new file mode 100644
index 0000000..c0c330f
--- /dev/null
+++ b/kafka/monitoring_commands.md
@@ -0,0 +1,4 @@
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.status, .base_url]' -c
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o end | jq '[.request.ingest_request_source, .status, .request.base_url, .terminal.terminal_url]' -c
diff --git a/kafka/topics.md b/kafka/topics.md
index 36337da..a699e16 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -25,6 +25,63 @@ retention (on both a size and time basis).
=> fewer partitions with batch mode, but still a bunch (24?)
=> key is sha1hex of PDF. enable time compaction (6 months?)
+ sandcrawler-ENV.ingest-file-requests-daily
+ => was ingest-file-requests previously, but renamed/rebalanced
+ => ingest requests from multiple sources; mostly continuous or pseudo-interactive
+ => schema is JSON; see ingest proposal for fields. small objects.
+ => fewer partitions with batch mode, but still a bunch (24)
+ => can't think of a good key, so none. enable time compaction (3-6 months?)
+
+ sandcrawler-ENV.ingest-file-requests-bulk
+ => ingest requests from bulk crawl sources; background processing
+ => same as ingest-file-requests
+
+ sandcrawler-ENV.ingest-file-requests-priority
+ => ingest requests from bulk crawl sources; background processing
+ => same as ingest-file-requests
+
+ sandcrawler-ENV.ingest-file-results
+ => ingest requests from multiple sources
+ => schema is JSON; see ingest proposal for fields. small objects.
+ => 6 partitions
+ => can't think of a good key, so none; no compaction
+
+ sandcrawler-ENV.pdftrio-output
+ => output of each pdftrio ML classification
+ => schema is JSON; see pdftrio proposal for fields. small objects.
+ => 6 partitions
+ => key is sha1hex of PDF; enable key compaction
+
+ sandcrawler-ENV.unextracted
+ => PDF files in IA needing extraction (thumbnails and text)
+ => schema is sandcrawler-db style JSON. Can be either `cdx` or `petabox` object
+ => fewer partitions with batch mode, but still a bunch (12? 24?)
+ => key is sha1hex of PDF. enable time compaction (6 months?)
+
+ sandcrawler-ENV.pdf-text
+ => fulltext (raw text) and PDF metadata for pdfs
+ => schema is JSON; see pdf_meta proposal for fields. large objects.
+ => 12 partitions
+ => key is sha1hex of PDF; enable key compaction; gzip compression
+
+ sandcrawler-ENV.xml-doc
+ => fulltext XML; mostly JATS XML
+ => schema is JSON, with 'jats_xml' field containing the XML as a string
+ => 6 partitions
+ => key is sha1hex of XML document; enable key compaction; gzip compression
+
+ sandcrawler-ENV.html-teixml
+ => extracted fulltext from HTML; mostly TEI-XML
+ => schema is JSON, with 'tei_xml' field containing the XML as a string
+ => 6 partitions
+ => key is sha1hex of source HTML document; enable key compaction; gzip compression
+
+ sandcrawler-ENV.pdf-thumbnail-SIZE-TYPE
+ => thumbnail images (eg, png, jpg) from PDFs
+ => raw bytes in message (no JSON or other wrapping). fields average 10 KByte
+ => 12 partitions; expect a TByte or so total
+ => key is sha1hex of PDF; enable key compaction; no compression
+
fatcat-ENV.api-crossref
fatcat-ENV.api-datacite
=> all new and updated DOIs (regardless of type)
@@ -33,8 +90,15 @@ retention (on both a size and time basis).
=> ~1TB capacity; 8x crossref partitions, 4x datacite
=> key compaction possible
+ fatcat-ENV.ftp-pubmed
+ => new citations from FTP server, from: ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
+ => raw XML, one record per message (PubmedArticle, up to 25k records/day and 650MB/day)
+ => key: PMID
+ => key compaction possible
+
fatcat-ENV.api-crossref-state
fatcat-ENV.api-datacite-state
+ fatcat-ENV.ftp-pubmed-state
fatcat-ENV.oaipmh-pubmed-state
fatcat-ENV.oaipmh-arxiv-state
fatcat-ENV.oaipmh-doaj-journals-state (DISABLED)
@@ -54,15 +118,28 @@ retention (on both a size and time basis).
=> v03 is newer v0.3.0 API schema (backwards incompatible)
=> key: fcid
=> 8x partitions
- fatcat-ENV.work-updates
- => key: fcid
- => 8x partitions
fatcat-ENV.container-updates
=> key: fcid
=> 4x partitions
fatcat-ENV.file-updates
=> key: fcid
=> 4x partitions
+ fatcat-ENV.work-ident-updates
+ => work identifiers when updated and needs re-indexing (eg, in scholar)
+ => 6x partitions
+ => key: doc ident ("work_{ident}")
+ => key compaction possible; long retention
+
+ scholar-ENV.sim-updates
+ => 6x partitions
+ => key: "sim_item_{}"
+ => key compaction possible; long retention
+ scholar-ENV.update-docs
+ => 12x partitions
+ => key: scholar doc identifier
+ => gzip compression
+ => key compaction possible
+ => short time-based retention (2 months?)
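+
+Several of the topics above are keyed by a hash (eg, sha1hex of a PDF) so that
+key compaction keeps only the most recent message per document. A minimal
+sketch of producing such a keyed message with `confluent_kafka` (values and
+broker address are illustrative, not actual worker code):
+
+    import json
+    from confluent_kafka import Producer
+
+    producer = Producer({
+        "bootstrap.servers": "localhost:9092",
+        "compression.codec": "gzip",  # matches topics created with compression.type=gzip
+    })
+    record = {"sha1hex": "1a6462a925a9767b797fe6085093b6aa9f27f523", "status": "success"}
+    producer.produce(
+        "sandcrawler-qa.pdf-text",
+        key=record["sha1hex"].encode("utf-8"),  # compaction key
+        value=json.dumps(record).encode("utf-8"),
+    )
+    producer.flush()
+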
### Deprecated/Unused Topics
@@ -99,19 +176,39 @@ exists`; this seems safe, and the settings won't be over-ridden.
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ungrobided-pg
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.grobid-output-pg --config compression.type=gzip --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.ingest-file-requests-daily --config retention.ms=7889400000 --config cleanup.policy=delete
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-requests-priority --config retention.ms=7889400000 --config cleanup.policy=delete
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-results
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact
+
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03
- ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.file-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.container-updates
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic fatcat-qa.work-ident-updates
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-crossref
- ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.ftp-pubmed --config cleanup.policy=compact
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-crossref-state
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.api-datacite-state
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.ftp-pubmed-state
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-pubmed
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.oaipmh-arxiv
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-pubmed-state
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.oaipmh-arxiv-state
+ # only 3 partitions in QA
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.pdf-text --config compression.type=gzip --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.pdf-thumbnail-180px-jpg --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.unextracted
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic scholar-qa.sim-updates
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic scholar-qa.update-docs --config compression.type=gzip --config cleanup.policy=compact --config retention.ms=7889400000
+
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.xml-doc --config compression.type=gzip --config cleanup.policy=compact
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.html-teixml --config compression.type=gzip --config cleanup.policy=compact
+
diff --git a/minio/README.md b/minio/README.md
deleted file mode 100644
index 3ce0f95..0000000
--- a/minio/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-
-minio is used as an S3-compatible blob store. Initial use case is GROBID XML
-documents, addressed by the sha1 of the PDF file the XML was extracted from.
-
-Note that on the backend minio is just storing objects as files on disk.
-
-## Buckets
-
-Notable buckets, and structure/naming convention:
-
- grobid/
- 2c/0d/2c0daa9307887a27054d4d1f137514b0fa6c6b2d.tei.xml
- SHA1 (lower-case hex) of PDF that XML was extracted from
- unpaywall/grobid/
- 2c/0d/2c0daa9307887a27054d4d1f137514b0fa6c6b2d.tei.xml
- SHA1 (lower-case hex) of PDF that XML was extracted from
- (mirror of /grobid/ for which we crawled for unpaywall and made publicly accessible)
-
-Create new buckets like:
-
- mc mb sandcrawler/grobid
-
-## Users
-
-Create a new readonly user like:
-
- mc admin user add sandcrawler unpaywall $RANDOM_SECRET_KEY readonly
-
-Make a prefix within a bucket world-readable like:
-
- mc policy set download sandcrawler/unpaywall/grobid
diff --git a/notes/dryad_datasets.md b/notes/dryad_datasets.md
new file mode 100644
index 0000000..5c727b1
--- /dev/null
+++ b/notes/dryad_datasets.md
@@ -0,0 +1,17 @@
+
+api docs: https://datadryad.org/api/v2/docs
+
+current search queries return 38,000 hits (December 2020)
+
+example with multiple versions:
+ https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0
+ https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0
+ https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions
+
+
+how to handle versions? DOI doesn't get incremented.
+
+on archive.org, could have separate item for each version, or sub-directories within item, one for each version
+
+in fatcat, could have a release for each version, but only one with
+the DOI; or could have a separate fileset for each version
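+
+A quick way to poke at the versions question is to fetch both the dataset and
+its versions list from the API and compare. A rough sketch (just dumping the
+raw JSON; exact response fields should be checked against the API docs above):
+
+    import json
+    from urllib.parse import quote
+
+    import requests
+
+    base = "https://datadryad.org/api/v2"
+    doi = "doi:10.5061/dryad.fbg79cnr0"
+    dataset = requests.get(f"{base}/datasets/{quote(doi, safe='')}").json()
+    versions = requests.get(f"{base}/datasets/{quote(doi, safe='')}/versions").json()
+    print(json.dumps(dataset, indent=2))
+    print(json.dumps(versions, indent=2))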
diff --git a/notes/examples/2021-11-12_broken_grobid_xml.md b/notes/examples/2021-11-12_broken_grobid_xml.md
new file mode 100644
index 0000000..5223651
--- /dev/null
+++ b/notes/examples/2021-11-12_broken_grobid_xml.md
@@ -0,0 +1,83 @@
+
+Find all the PDFs from the web which resulted in a `bad-grobid-xml` status code (among others):
+
+ sql> select * from grobid where status != 'success' and status_code != 500 and status_code != 503 and status != 'error-timeout' limit 100;
+
+ sha1hex | updated | grobid_version | status_code | status | fatcat_release | metadata
+ ------------------------------------------+-------------------------------+----------------+-------------+----------------+----------------+------------------------------------------------------------------------
+ d994efeea3b653e2dbe8e13e5a6d203e9b9484ab | 2020-03-20 04:04:40.093094+00 | | 200 | error | | {"error_msg": "response XML too large: 12052192 bytes"}
+ 8dadf846488ddc2ff3934dd6beee0e3046fa3800 | 2020-11-24 01:24:02.668692+00 | | 200 | error | | {"error_msg": "response XML too large: 18758248 bytes"}
+ 227900724e5cf9fbd06146c914239d0c12c3671a | 2020-03-18 10:24:33.394339+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20200210041053/https://pdfs.semanticscholar.org/2279/00724e5cf9fbd06146c914239d0c12c3671a.pdf
+ FIXED
+ f667b4ef2befb227078169ed57ffc6efc5fa85c2 | 2020-03-20 04:54:18.902756+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 527"}
+ https://web.archive.org/web/20200218182411/https://pdfs.semanticscholar.org/f667/b4ef2befb227078169ed57ffc6efc5fa85c2.pdf
+ FIXED
+ c1e8d9df347b8de53fc2116615b1343ba327040d | 2020-11-08 21:46:04.552442+00 | | 200 | bad-grobid-xml | | {"error_msg": "mismatched tag: line 198, column 3"}
+ https://web.archive.org/web/20200904163312/https://arxiv.org/pdf/1906.02107v1.pdf
+ FIXED (and good)
+ 4d9860a5eeee6bc671c3be859ca78f89669427f0 | 2021-11-04 01:29:13.081596+00 | | 200 | bad-grobid-xml | | {"error_msg": "unclosed token: line 812, column 7"}
+ https://web.archive.org/web/20211104012833/https://actabalneologica.eu/wp-content/uploads/library/ActaBalneol2021i3.pdf
+ FIXED
+ metadata quality mixed, but complex document (?)
+ 7cfc0739be9c49d94272110a0a748256bdde9be6 | 2021-07-25 17:06:03.919073+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 38, column 440"}
+ https://web.archive.org/web/20210716124436/https://jsesd.csers-ly.com/index.php/jsesd/article/download/28/23
+ FIXED
+ 088c61a229084d13f85524efcc9f38a80dd19caf | 2021-09-01 08:08:18.531533+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 47, column 814"}
+ https://web.archive.org/web/20210814181328/https://wmrj.areeo.ac.ir/article_120843_3806466cb1f5a125c328f99866751a43.pdf
+ FIXED
+ 19e70297e523e9f32cd4379af33a12ab95c34a71 | 2021-11-05 10:09:25.407657+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 853, column 84"}
+ not found
+ acc855d74431537b98de5185e065e4eacbab7b26 | 2021-11-12 22:57:22.439007+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 60, column 45"}
+ https://web.archive.org/web/20211111182756/https://arxiv.org/pdf/2006.13365v5.pdf
+ BROKEN: not well-formed (invalid token): line 60, column 45
+ <note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note>
+ 8e73055c63d1e684b59059ac418f55690a2eec01 | 2021-11-12 17:34:46.343685+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 44, column 45"}
+ not found
+ c2b3f696e97b9e80f38c35aa282416e95d6d9f5e | 2021-11-12 22:57:12.417191+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 58, column 45"}
+ https://web.archive.org/web/20211112051714/https://ccsenet.org/journal/index.php/gjhs/article/download/0/0/46244/49308
+ BROKEN: not well-formed (invalid token): line 58, column 45
+ <note type="raw_affiliation"><label>&</label> Ren, 2020; Meng, Hua, &amp; Bian, 2020).</note>
+ 840d4609308c4a7748393181fe1f6a45f9d425c5 | 2021-11-12 22:57:17.433022+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 1824, column 45"}
+ not found
+ 3deb6375e894c5007207502bf52d751a47a20725 | 2021-11-12 23:11:17.711948+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 65, column 45"}
+ not found
+ f1d06080a4b1ac72ab75226e692e8737667c29a7 | 2020-01-16 09:23:27.579995+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 29, column 1581"}
+ https://web.archive.org/web/20180721030918/https://journals.squ.edu.om/index.php/jams/article/download/650/649
+ FIXED, good
+ f3e7b91fce9132addc59bd1560c5eb16c0330842 | 2020-01-12 11:58:06.654613+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180426020051/http://jhsw.tums.ac.ir/article-1-5121-en.pdf
+ FIXED
+ 37edcaa6f67fbb8c3e27fa02da4f0fa780e33bca | 2020-01-04 21:53:49.578847+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 1284"}
+ https://web.archive.org/web/20180510115632/http://www.fmreview.org/sites/fmr/files/FMRdownloads/ar/detention/majidi.pdf
+ FIXED
+ 3f1d302143824808f7109032687a327708896748 | 2020-01-05 20:51:18.783034+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180428082655/http://jhsw.tums.ac.ir/browse.php?a_id=5121&sid=1&slc_lang=fa&ftxt=1
+ FIXED
+ (21 rows)
+
+Some other errors from other queries:
+
+ d9634f194bc3dee27db7a1cb49b30e48803d7ad8 | 2020-01-06 16:01:09.331272+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/VyuJWqREHT.lxml"}
+ https://web.archive.org/web/20190304092121/http://pdfs.semanticscholar.org/d963/4f194bc3dee27db7a1cb49b30e48803d7ad8.pdf
+ FIXED: with 0.7.0+
+
+ 56c9b5398ef94df54d699342740956caf4523925 | 2020-02-06 21:37:42.139761+00 | | 500 | error | | {"error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1"}
+ https://web.archive.org/web/20080907000756/http://www.rpi.edu/~limc/poster_ding.pdf
+ still errors: "error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1", "status": "error", "status_code": 500
+ BAD PDF ("no pages" in evince)
+
+ d7cf65ed211cf1e3420c595fdbecc5d18f297b11 | 2020-01-10 23:19:16.783415+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/dBV73X4HrZ.lxml"}
+ https://web.archive.org/web/20170812074846/http://dspace.utpl.edu.ec/bitstream/123456789/7918/1/Tesis_de_Jacome_Valdivieso_Soraya_Stephan%c3%ada.pdf
+ FIXED
+
+ 51d070ab398a8744286ef7356445f0828a9f3abb | 2020-02-06 16:01:23.98892+00 | | 503 | error | | {"error_msg": "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n<t
+ https://web.archive.org/web/20191113160818/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC2082155&blobtype=pdf
+ FIXED
+
+In summary, there are still a small number of `bad-grobid-xml` cases, and still
+many "very large PDF" cases. But we should probably broadly retry everything,
+especially the 503 errors (from when GROBID is simply down/unavailable).
+
+The `bad-grobid-xml` cases here were all from "<label>" in raw affiliations,
+which I have submitted a patch/PR for.
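+
+For reference, this failure mode is easy to reproduce outside the pipeline: the
+bare `&` inside the `<label>` element is not a valid XML token. A small sketch
+using the stdlib parser (which produces the same "not well-formed" message
+style seen above):
+
+    import xml.etree.ElementTree as ET
+
+    snippet = '<note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note>'
+    try:
+        ET.fromstring(snippet)
+    except ET.ParseError as err:
+        print(err)  # not well-formed (invalid token): ...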
diff --git a/notes/examples/dataset_examples.txt b/notes/examples/dataset_examples.txt
new file mode 100644
index 0000000..3a04750
--- /dev/null
+++ b/notes/examples/dataset_examples.txt
@@ -0,0 +1,52 @@
+
+### ArchiveOrg: CAT dataset
+
+<https://archive.org/details/CAT_DATASET>
+
+`release_36vy7s5gtba67fmyxlmijpsaui`
+
+###
+
+<https://archive.org/details/academictorrents_70e0794e2292fc051a13f05ea6f5b6c16f3d3635>
+
+doi:10.1371/journal.pone.0120448
+
+Single .rar file
+
+### Dataverse
+
+<https://dataverse.rsu.lv/dataset.xhtml?persistentId=doi:10.48510/FK2/IJO02B>
+
+Single excel file
+
+### Dataverse
+
+<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+doi:10.7910/DVN/CLSFKX
+
+Multiple files; multiple versions?
+
+API fetch: <https://dataverse.harvard.edu/api/datasets/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+ .data.id
+ .data.latestVersion.datasetPersistentId
+ .data.latestVersion.versionNumber, .versionMinorNumber
+ .data.latestVersion.files[]
+ .dataFile
+ .contentType (mimetype)
+ .filename
+ .filesize (int, bytes)
+ .md5
+ .persistentId
+ .description
+ .label (filename?)
+ .version
+
+Single file inside: <https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB>
+
+Download single file: <https://dataverse.harvard.edu/api/access/datafile/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> (redirects to AWS S3)
+
+Dataverse refs:
+- 'doi' and 'hdl' are the two persistentId styles
+- file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled
diff --git a/notes/examples/html_test_journals.txt b/notes/examples/html_test_journals.txt
new file mode 100644
index 0000000..540dc9f
--- /dev/null
+++ b/notes/examples/html_test_journals.txt
@@ -0,0 +1,153 @@
+
+Good examples of journals to run HTML fulltext extraction on.
+
+## Live Web
+
+d-lib magazine
+ live web
+ no longer active
+ http://www.dlib.org/back.html
+
+NLM technical bulletin
+ https://www.nlm.nih.gov/pubs/techbull/back_issues.html
+
+Genders
+ https://web.archive.org/web/20141227010240/http://www.genders.org:80/index.html
+
+firstmondays
+ live web; now OJS
+
+outhistory.org
+
+http://journal.sjdm.org/
+
+http://whoosh.org/
+
+
+## Vanished (but wayback coverage)
+
+ohmylittledata
+ issn:2551-1289
+ vanished
+ blog format
+ http://web.archive.org/web/20180421061156/https://ohmylittledata.com/
+
+exquisite corpse
+ https://web.archive.org/web/20080521052400/http://corpse.org:80/
+
+Journal of Mundane Behavior
+ https://fatcat.wiki/container/tjwfvrjlunf25ofegccgjjmvya
+ ISSN: 1529-3041
+
+ defunct since ~2010
+ simple HTML articles
+ references
+ http://web.archive.org/web/20100406162007/http:/mundanebehavior.org/index2.htm
+ http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm
+
+War Crimes
+
+ PDF articles (not HTML)
+ http://web.archive.org/web/20120916035741/http:/www.war-crimes.org/
+
+
+## DOAJ Test Articles (HTML)
+
+ zcat doaj_article_data_2020-08-07.json.gz | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.txt
+ => 2,184,954
+
+ cut -f3 -d/ html_fulltext_urls.txt | sort | uniq -c | sort -nr | head -n25
+ 254817 link.springer.com
+ 145159 www.scielo.br
+ 78044 journal.frontiersin.org
+ 77394 www.frontiersin.org
+ 40849 www.dovepress.com
+ 19024 dergipark.org.tr
+ 18758 periodicos.ufsc.br
+ 16346 www.revistas.usp.br
+ 15872 revistas.unal.edu.co
+ 15527 revistas.ucm.es
+ 13669 revistas.usal.es
+ 12640 dergipark.gov.tr
+ 12111 journals.rudn.ru
+ 11839 www.scielosp.org
+ 11277 www.karger.com
+ 10827 www.journals.vu.lt
+ 10318
+ 9854 peerj.com
+ 9100 ojs.unud.ac.id
+ 8581 jurnal.ugm.ac.id
+ 8261 riviste.unimi.it
+ 8012 journals.uran.ua
+ 7454 revistas.pucp.edu.pe
+ 7264 journals.vgtu.lt
+ 7200 publicaciones.banrepcultural.org
+
+ cat html_fulltext_urls.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.filtered.txt
+ => 1,579,257
+
+ zcat doaj_article_data_2020-08-07.json.gz | rg -v '"doi"' | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.no_doi.txt
+ => 560k
+
+ cut -f3 -d/ html_fulltext_urls.no_doi.txt | sort | uniq -c | sort -nr | head -n25
+ 40849 www.dovepress.com
+ 10570 journals.rudn.ru
+ 10494 dergipark.org.tr
+ 10233 revistas.unal.edu.co
+ 9981 dergipark.gov.tr
+ 9428 revistas.usal.es
+ 8292 revistas.ucm.es
+ 7200 publicaciones.banrepcultural.org
+ 6953 revistas.pucp.edu.pe
+ 6000 www.scielosp.org
+ 5962 www.scielo.br
+ 5621 www.richtmann.org
+ 5123 scielo.sld.cu
+ 5067 ojs.unud.ac.id
+ 4838 periodicos.ufsc.br
+ 4736 revistasonlinepre.inap.es
+ 4486 journal.fi
+ 4221 www.seer.ufu.br
+ 3553 revistas.uam.es
+ 3492 revistas.pucsp.br
+ 3060 www.scielo.org.co
+ 2991 scielo.isciii.es
+ 2802 seer.ufrgs.br
+ 2692 revistas.unc.edu.ar
+ 2685 srl.si
+
+ cat html_fulltext_urls.no_doi.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.no_doi.filtered.txt
+ => 518,608
+
+ zcat doaj_articles_2020-08-07.html_fulltext_urls.no_doi.filtered.txt.gz | shuf -n20
+ https://revistas.unc.edu.ar/index.php/revistaEF/article/view/22795
+ https://journal.umy.ac.id/index.php/st/article/view/3297
+ https://www.unav.edu/publicaciones/revistas/index.php/estudios-sobre-educacion/article/view/23442
+ http://publications.muet.edu.pk/research_papers/pdf/pdf1615.pdf
+ http://revistas.uncu.edu.ar/ojs/index.php/revistaestudiosclasicos/article/view/1440
+ https://journal.fi/inf/article/view/59430
+ http://journal.uii.ac.id/index.php/Eksakta/article/view/2429
+ https://www.dovepress.com/infant-sleep-and-its-relation-with-cognition-and-growth-a-narrative-re-peer-reviewed-article-NSS
+ https://revistasonlinepre.inap.es/index.php/REALA/article/view/9157
+ http://dergipark.org.tr/dubited/issue/27453/299047?publisher=duzce
+ http://revistas.pucp.edu.pe/index.php/themis/article/view/11862
+ http://journal.bdfish.org/index.php/fisheries/article/view/91
+ https://ojs.unud.ac.id/index.php/buletinfisika/article/view/30567
+ https://www.lithosphere.ru/jour/article/view/779
+ https://journals.hioa.no/index.php/seminar/article/view/2412
+ http://revistas.unicauca.edu.co/index.php/rfcs/article/view/197
+ https://www.kmuj.kmu.edu.pk/article/view/15698
+ http://forodeeducacion.com/ojs/index.php/fde/article/view/82
+ https://revistas.unc.edu.ar/index.php/ConCienciaSocial/article/view/19941
+ http://grbs.library.duke.edu/article/view/3361
+
diff --git a/notes/examples/random_datasets.md b/notes/examples/random_datasets.md
new file mode 100644
index 0000000..b69132c
--- /dev/null
+++ b/notes/examples/random_datasets.md
@@ -0,0 +1,19 @@
+
+Possible external datasets to ingest (which are not entire platforms):
+
+- https://research.google/tools/datasets/
+- https://openslr.org/index.html
+- https://www.kaggle.com/datasets?sort=votes&tasks=true
+- https://archive.ics.uci.edu/ml/datasets.php
+
+Existing archive.org datasets to ingest:
+
+- https://archive.org/details/allthemusicllc-datasets
+
+Papers on archive.org to ingest:
+
+- <https://archive.org/details/journals?and%5B%5D=%21collection%3Aarxiv+%21collection%3Ajstor_ejc+%21collection%3Apubmed&sin=>
+- <https://archive.org/details/biorxiv>
+- <https://archive.org/details/philosophicaltransactions?tab=collection>
+- <https://archive.org/search.php?query=doi%3A%2A>
+- <https://archive.org/details/folkscanomy_academic>
diff --git a/notes/fuzzy_match_notes.md b/notes/fuzzy_match_notes.md
new file mode 100644
index 0000000..a87364c
--- /dev/null
+++ b/notes/fuzzy_match_notes.md
@@ -0,0 +1,148 @@
+
+These are notes on how bibliographic metadata matches (of records) and
+slugification (to create lookup keys on title strings) worked in the past in
+the sandcrawler repository. Eg, circa 2018.
+
+## Scala Slug-ification
+
+Original title strings longer than 1023 characters were rejected (before
+slug-ifying).
+
+There was a "slug-denylist". Additionally, scorable strings needed to be
+between 8 and 1023 characters (not bytes) long (inclusive).
+
+Slugification transform was:
+
+- lower-case
+- remove whitespace ("\s")
+- strip specific accent characters:
+ '\u0141' -> 'L',
+ '\u0142' -> 'l', // Letter ell
+ '\u00d8' -> 'O',
+ '\u00f8' -> 'o'
+- remove all '\p{InCombiningDiacriticalMarks}'
+- remove punctuation:
+ \p{Punct}
+  ’·“”‘’“”«»「」¿–±§
+
+Partially adapted from apache commons: <https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934>
+
+My original notes/proposal:
+
+1. keep only \p{Ideographic}, \p{Alphabetic}, and \p{Digit}
+2. strip accents
+3. "lower-case" (unicode-aware)
+4. do any final custom/manual mappings
+
+Resulting slugs less than 8 characters long were rejected, and slugs were
+checked against a denylist.
+
+Only 554 entries in the denylist; could just ship that in the library.
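+
+A rough Python re-implementation of the above transform (a sketch: uses `unicodedata` to approximate `\p{InCombiningDiacriticalMarks}`, ASCII `string.punctuation` plus the extra characters above for `\p{Punct}`, and a placeholder denylist):
+
+    import re
+    import string
+    import unicodedata
+
+    ACCENT_MAP = str.maketrans({"\u0141": "L", "\u0142": "l", "\u00d8": "O", "\u00f8": "o"})
+    EXTRA_PUNCT = "’·“”‘’“”«»「」¿–±§"
+    DENYLIST = {"abstract", "editorial"}  # placeholder; the real list had ~554 entries
+
+    def slugify(title):
+        if not title or len(title) > 1023:
+            return None
+        slug = re.sub(r"\s", "", title.lower())
+        slug = slug.translate(ACCENT_MAP)
+        # drop combining diacritical marks
+        slug = "".join(c for c in unicodedata.normalize("NFKD", slug) if not unicodedata.combining(c))
+        slug = "".join(c for c in slug if c not in string.punctuation and c not in EXTRA_PUNCT)
+        if len(slug) < 8 or slug in DENYLIST:
+            return None
+        return slug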
+
+
+## Python Tokenization
+
+- "&apos;" -> "'"
+- remove non "isalnum()" characters
+- encode as ASCII; this removes diacritics etc, but also all non-latin character sets
+- optionally remove all whitespace
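+
+Roughly, in Python (a sketch of the described steps, not the original code):
+
+    def tokenize(s, remove_whitespace=False):
+        s = s.replace("&apos;", "'")
+        # keep only alphanumeric characters (plus whitespace, for now)
+        s = "".join(c for c in s if c.isalnum() or c.isspace())
+        if remove_whitespace:
+            s = "".join(s.split())
+        # ASCII-encode: drops diacritics etc, but also all non-latin character sets
+        return s.encode("ascii", errors="ignore").decode("ascii")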
+
+
+## Python GROBID Cleanups
+
+These are likely pretty GROBID-specific. Article title was required, but any of
+the other filtered-out fields just resulted in partial metadata. These filters
+are the result of lots of manual verification of results, and doing things like
+taking truncating titles and looking at the most popular prefixes for a large
+random sample.
+
+Same denylist for title slugs as Scala, plus:
+
+ editorial
+ advertisement
+ bookreviews
+ reviews
+ nr
+ abstractoriginalarticle
+ originalarticle
+ impactfactor
+ articlenumber
+
+Other filters on title strings (any of these bad):
+
+- 500 or more characters long
+- tokenized string less than 10 characters
+- tokenized starts with 'nr' or 'issn'
+- lowercase starts with 'int j' or '.int j'
+- contains both "volume" and "issue"
+- contains "downloadedfrom"
+- fewer than 2 or more than 50 tokens (words)
+- more than 12 tokens only a single character long
+- more than three ":"; more than one "|"; more than one "."
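+
+As a rough Python sketch (assumes `tokenized` is the output of the tokenization above, whitespace kept; thresholds copied from the list):
+
+    def title_is_bad(title, tokenized):
+        words = title.split()
+        return bool(
+            len(title) >= 500
+            or len(tokenized.replace(" ", "")) < 10
+            or tokenized.startswith(("nr", "issn"))
+            or title.lower().startswith(("int j", ".int j"))
+            or ("volume" in title.lower() and "issue" in title.lower())
+            or "downloadedfrom" in tokenized.replace(" ", "")
+            or len(words) < 2 or len(words) > 50
+            or sum(1 for w in words if len(w) == 1) > 12
+            or title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1
+        )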
+
+Remove title prefixes (but allow):
+
+ "Title: "
+ "Original Article: "
+ "Original Article "
+ "Article: "
+
+Denylist for authors:
+
+ phd
+ phdstudent
+
+Journal name processing:
+
+- apply title denylist
+- remove prefixes
+ characters: /~&©
+ Original Research Article
+ Original Article
+ Research Article
+ Available online www.jocpr.com
+- remove suffixes
+ Available online at www.sciarena.com
+ Original Article
+ Available online at
+ ISSN
+ ISSUE
+- remove anywhere
+ e-ISSN
+ p-ISSN
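+
+A sketch of that cleanup in Python (illustrative; ordering and exact prefix/suffix handling are guesses, and the title-denylist step is omitted):
+
+    JOURNAL_PREFIXES = ["Original Research Article", "Original Article", "Research Article", "Available online www.jocpr.com"]
+    JOURNAL_SUFFIXES = ["Available online at www.sciarena.com", "Original Article", "Available online at", "ISSN", "ISSUE"]
+
+    def clean_journal_name(name):
+        # (the title denylist from above is also applied; omitted here)
+        name = name.lstrip("/~&© ").strip()
+        for prefix in JOURNAL_PREFIXES:
+            if name.startswith(prefix):
+                name = name[len(prefix):].strip()
+        for suffix in JOURNAL_SUFFIXES:
+            if name.endswith(suffix):
+                name = name[:-len(suffix)].strip()
+        for token in ("e-ISSN", "p-ISSN"):
+            name = name.replace(token, "")
+        return name.strip()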
+
+## Python Grouping Comparison
+
+Would consume joined groups, row-by-row. At most 10 matches per group; any more
+and skip (this was for file-to-release).
+
+Overall matching requirements:
+
+- string similarity threshold from scala code
+ https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
+ https://stackoverflow.com/questions/955110/similarity-string-comparison-in-java/16018452#16018452
+- authors should be consistent
+ - convert one author list into space-separated tokens
+ - remove "jr." from all author token lists
+ - the last word for each author full name in the other list (eg, the lastname),
+ tokenized, must be in the token set
+- if both years defined, then must match exactly (integers)
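+
+The author and year checks, roughly, in Python (a sketch of the rules above; the string-similarity threshold is not reproduced here):
+
+    def authors_consistent(authors_a, authors_b):
+        # one author list becomes a set of space-separated tokens, minus "jr."
+        tokens = set()
+        for name in authors_a:
+            tokens.update(t for t in name.lower().split() if t != "jr.")
+        # the last word of each full name in the other list (eg, the lastname),
+        # must appear in that token set
+        for name in authors_b:
+            words = [t for t in name.lower().split() if t != "jr."]
+            if words and words[-1] not in tokens:
+                return False
+        return True
+
+    def years_consistent(year_a, year_b):
+        # if both years are defined, they must match exactly (as integers)
+        if year_a is None or year_b is None:
+            return True
+        return int(year_a) == int(year_b)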
+
+In the code, there is a note:
+
+ Note: the actual importer/merger should filter the following patterns out:
+ - container title has "letter" and "diar"
+ - contribs (authors) contain "&NA;"
+ - dates differ (not just year)
+
+
+## Scala Metadata Keys
+
+Only the titles were ever actually used (in scala), but the keys allowed were:
+
+- title
+- authors (list of strings)
+- year (int)
+- contentType
+- doi
+
diff --git a/notes/grobid_munging.txt b/notes/grobid_munging.txt
new file mode 100644
index 0000000..013e458
--- /dev/null
+++ b/notes/grobid_munging.txt
@@ -0,0 +1,70 @@
+
+In docker:
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg | pv -l | rg 'OA-JOURNAL-CRAWL-2019-08' > OA-JOURNAL-CRAWL-2019-08.grobid.json
+ # 5.01M 0:31:04 [2.69k/s]
+ # 277 GByte grobid-output.prod.json
+
+Then:
+
+ cat grobid-output.prod.json | rg 'OA-JOURNAL-CRAWL-2019-08' | pv -l > OA-JOURNAL-CRAWL-2019-08.grobid.json
+ # 265k 0:32:12 [ 137 /s]
+
+ pigz grobid-output.prod.json
+ # 63 GByte grobid-output.prod.json.gz
+
+ cat OA-JOURNAL-CRAWL-2019-08.grobid.json | pv -l | jq "[.key, .status, .status_code, .error_msg] | @tsv" -r | sort -u -S 4G | uniq --check-chars 40 > OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+ # 265k
+
+ wc -l OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+ # 212879 OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+
+ cut -f2 OA-JOURNAL-CRAWL-2019-08.grobid.tsv | sort | uniq -c
+ # 14087 error
+ # 198792 success
+
+In sandcrawler pipenv:
+
+ head -n100 /grande/oa-crawl-grobid/OA-JOURNAL-CRAWL-2019-08.grobid.json | ./grobid_tool.py transform --metadata-only - > /grande/oa-crawl-grobid/OA-JOURNAL-CRAWL-2019-08.metadata.json.sample
+
+ cat /grande/oa-crawl-grobid/OA-JOURNAL-CRAWL-2019-08.grobid.json | parallel --linebuffer --round-robin --pipe -j8 ./grobid_tool.py transform --metadata-only - > /grande/oa-crawl-grobid/OA-JOURNAL-CRAWL-2019-08.metadata.json
+
+ cat OA-JOURNAL-CRAWL-2019-08.metadata.json | rg -v '"fatcat_release": null' > OA-JOURNAL-CRAWL-2019-08.metadata.matched.json
+
+ wc -l OA-JOURNAL-CRAWL-2019-08.metadata.matched.json OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+ # 28162 OA-JOURNAL-CRAWL-2019-08.metadata.matched.json
+ # 212879 OA-JOURNAL-CRAWL-2019-08.grobid.tsv
+
+Next steps:
+- import the matched files (while verifying match)
+- some web interface to make sandcrawler easier?
+ input: sha1 or url
+ view: grobid status and metadata, ML results, fatcat metadata (via API lookup)
+ links/actions: view PDF, re-run GROBID, add to a release (via API)
+
+## BAD/BROKEN
+
+All of the following didn't work because old versions of kafkacat only read
+partial results. Ended up using docker to run more recent ubuntu, sigh.
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg -e | pv -l > grobid-output.prod.json
+
+ cat grobid-output.prod.json | rg '"status": "success"' > grobid-output.prod.success.json
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg -e | pv -l | rg '"status": "success"' > grobid-output.prod.success.json
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg -e | pv -l | rg 'OA-JOURNAL-CRAWL-2019-08' > OA-JOURNAL-CRAWL-2019-08.grobid.json
+
+ head -n200 /grande/oa-crawl-grobid/grobid-output.prod.success.json | ./grobid_tool.py transform --metadata-only - | jq "[.fatcat_release, .biblio.title]" -c | less
+
+
+ cat OA-JOURNAL-CRAWL-2019-08.grobid.json | parallel --pipe -j8 jq .status -r | sort | uniq -c
+ 1879 error
+ 26698 success
+
+
+For the full grobid-output dump, counts were looking like:
+
+ 318561 error
+ 199607 success
+
diff --git a/notes/hadoop_job_log.md b/notes/hadoop_job_log.md
new file mode 100644
index 0000000..f812c0a
--- /dev/null
+++ b/notes/hadoop_job_log.md
@@ -0,0 +1,210 @@
+
+### QA matchcrossref
+
+[D8C7F2CA7620450991838D540489948D/8B17786779BE44579C98D8A325AC5959] sandcrawler.ScoreJob/(1/1) ...-24-2102.32-matchcrossref
+
+Submitted: Fri Aug 24 21:03:09 UTC 2018
+Started: Fri Aug 24 21:03:20 UTC 2018
+Finished: Sat Aug 25 09:46:55 UTC 2018
+Elapsed: 12hrs, 43mins, 34sec
+Diagnostics:
+Average Map Time 24mins, 31sec
+Average Shuffle Time 15sec
+Average Merge Time 21sec
+Average Reduce Time 7mins, 17sec
+
+Map 2312 2312
+Reduce 100 100
+
+crossref-rows-filtered 73901964 0 73901964
+grobid-rows-filtered 1092992 0 1092992
+joined-rows 0 623837 623837
+
+cascading.flow.StepCounters
+Tuples_Read 94831255 0 94831255
+Tuples_Written 0 623837 623837
+
+Read_Duration 7108430 352241 7460671
+Tuples_Read 94831255 74994956 169826211
+Tuples_Written 74994956 623837 75618793
+Write_Duration 7650302 21468 7671770
+
+## QA UnGrobided
+
+Submitted: Sat Aug 25 01:23:22 UTC 2018
+Started: Sat Aug 25 05:06:36 UTC 2018
+Finished: Sat Aug 25 05:13:45 UTC 2018
+Elapsed: 7mins, 8sec
+Diagnostics:
+Average Map Time 1mins, 20sec
+Average Shuffle Time 12sec
+Average Merge Time 15sec
+Average Reduce Time 29sec
+
+Map 48 48
+Reduce 1 1
+
+bnewbold@bnewbold-dev$ gohdfs du -sh sandcrawler/output-qa/2018-08-25-0122.54-dumpungrobided/part*
+56.8M /user/bnewbold/sandcrawler/output-qa/2018-08-25-0122.54-dumpungrobided/part-00000
+
+## Prod UnGrobided
+
+[D76F6BF91D894E879E747C868B0DEDE7/394A1AFC44694992B71E6920AF8BA3FB] sandcrawler.DumpUnGrobidedJob/(1/1) ...26-0910.25-dumpungrobided
+
+Map 278 278
+Reduce 1 1
+
+Submitted: Sun Aug 26 09:10:51 UTC 2018
+Started: Sun Aug 26 09:18:21 UTC 2018
+Finished: Sun Aug 26 10:29:28 UTC 2018
+Elapsed: 1hrs, 11mins, 7sec
+Diagnostics:
+Average Map Time 4mins, 48sec
+Average Shuffle Time 24mins, 17sec
+Average Merge Time 14sec
+Average Reduce Time 13mins, 54sec
+
+
+cascading.flow.StepCounters
+Name
+Map
+Reduce
+Total
+Tuples_Read 64510564 0 64510564
+Tuples_Written 0 21618164 21618164
+
+## Prod Crossref Match
+
+[6C063C0809244446BA8602C3BE99CEC2/5FE5D87899154F38991A1ED58BEB34D4] sandcrawler.ScoreJob/(1/1) ...-25-1753.01-matchcrossref
+
+Map 2427 2427
+Reduce 50 50
+
+Submitted: Sat Aug 25 17:53:50 UTC 2018
+Started: Sat Aug 25 17:53:59 UTC 2018
+Finished: Sun Aug 26 11:22:52 UTC 2018
+Elapsed: 17hrs, 28mins, 52sec
+Diagnostics:
+Average Map Time 31mins, 20sec
+Average Shuffle Time 1mins, 21sec
+Average Merge Time 41sec
+Average Reduce Time 3hrs, 14mins, 39sec
+
+crossref-rows-filtered 73901964 0 73901964
+grobid-rows-filtered 14222226 0 14222226
+joined-rows 0 14115453 14115453
+
+## "Prod" Fatcat Group Works (run 2019-08-10)
+
+ ./please --prod groupworks-fatcat hdfs:///user/bnewbold/release_export.2019-07-07.json
+
+ job_1559844455575_118299
+ http://ia802401.us.archive.org:6988/proxy/application_1559844455575_118299
+
+## Re-GROBID batch (2019-11-12)
+
+Want to re-process "old" GROBID output with newer (0.5.5+fatcat) GROBID version
+(vanilla training) plus biblio-glutton identification. Hoping to make a couple
+million new fatcat matches; will probably do a later round of ML matching over
+this batch as well.
+
+ # in /grande/regrobid
+
+ # as postgres
+ psql sandcrawler < dump_regrobid_pdf.sql > dump_regrobid_pdf.txt
+
+ # as bnewbold
+ cat dump_regrobid_pdf.txt | sort -S 4G | uniq -w 40 | cut -f2 | pv -l > dump_regrobid_pdf.2019-11-12.json
+ # 41.5M lines, uniq by SHA1
+ # NOTE: not the full 56m+ from GROBID table... some in archive.org, others
+ # not application/pdf type. Will need to follow-up on those later
+
+ # intend to have 3 worker machines, but splitting 6 ways in case we need to
+ # re-balance load or get extra machines or something
+ split -n l/6 -a1 -d --additional-suffix=.json dump_regrobid_pdf.2019-11-12.json regrobid_cdx.split_
+
+ # distribute to tmp001, tmp002, tmp003:
+ tmp001: 0,1
+ tmp002: 2,3
+ tmp003: 4,5
+
+ # test local grobid config:
+ head /srv/sandcrawler/tasks/regrobid_cdx.split_0.json | pv -l | ./grobid_tool.py --grobid-host http://localhost:8070 -j0 extract-json - > example_out.json
+ # expect at least a couple fatcat matches
+ cat example_out.json | jq .tei_xml -r | rg fatcat
+
+ # test GROBID+kafka config:
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | head | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+ # full run, in a screen session
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+NOTE: really should get parallel kafka worker going soon. if there is a reboot
+or something in the middle of this process, will need to re-run from the start.
+
+Was getting a bunch of weird kafka INVALID_MSG errors on produce. Would be nice to be able to retry, so doing:
+
+ cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel --joblog regrobid_job.log --retries 5 -j40 --linebuffer --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+Never mind, going to split into chunks which can be retried.
+
+ cd /srv/sandcrawler/tasks
+ sudo chown sandcrawler:staff .
+ cat regrobid_cdx.split_* | split -l 20000 -a4 -d --additional-suffix=.json - chunk_
+ ls /srv/sandcrawler/tasks/chunk_*.json | parallel -j4 ./extract_chunk.sh {}
+
+extract_chunk.sh:
+
+
+ #!/bin/bash
+
+ set -x -e -u -o pipefail
+
+ if [ -f $1.SUCCESS ]; then
+ echo "Skipping: $1..."
+ exit
+ fi
+
+ echo "Extracting $1..."
+
+ date
+ cat $1 | parallel -j10 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+ touch $1.SUCCESS
+
+Seems to be working better! Tested that if there is a problem with one chunk, the others continue.
+
+## Pig Joins (around 2019-12-24)
+
+Partial (as a start):
+
+ pig -param INPUT_CDX="/user/bnewbold/pdfs/gwb-pdf-20191005172329" -param INPUT_DIGEST="/user/bnewbold/scihash/shadow.20191222.sha1b32.sorted" -param OUTPUT="/user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx" join-cdx-sha1.pig
+
+ HadoopVersion PigVersion UserId StartedAt FinishedAt Features
+    2.6.0-cdh5.11.2 0.12.0-cdh5.0.1 bnewbold 2019-12-27 00:39:38 2019-12-27 15:32:44 HASH_JOIN,ORDER_BY,DISTINCT,FILTER
+
+ Success!
+
+ Job Stats (time in seconds):
+ JobId Maps Reduces MaxMapTime MinMapTIme AvgMapTime MedianMapTime MaxReduceTime MinReduceTime AvgReduceTime MedianReducetime Alias Feature Outputs
+ job_1574819148370_46540 4880 0 143 10 27 21 n/a n/a n/a n/a cdx MAP_ONLY
+ job_1574819148370_46541 19 0 59 9 25 18 n/a n/a n/a n/a digests MAP_ONLY
+ job_1574819148370_46773 24 1 17 7 10 9 6 6 6 6 digests SAMPLER
+ job_1574819148370_46774 7306 1 55 4 7 7 25 25 25 25 cdx SAMPLER
+ job_1574819148370_46778 7306 40 127 8 18 15 4970 1936 2768 2377 cdx ORDER_BY
+ job_1574819148370_46779 24 20 80 24 60 66 90 26 38 37 digests ORDER_BY
+ job_1574819148370_46822 22 3 101 27 53 48 1501 166 735 539 DISTINCT
+ job_1574819148370_46828 7146 959 122 7 16 14 91 21 35 32 full_join,result HASH_JOIN /user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx,
+
+ Input(s):
+ Successfully read 1968654006 records (654323590996 bytes) from: "/user/bnewbold/pdfs/gwb-pdf-20191005172329"
+ Successfully read 74254196 records (2451575849 bytes) from: "/user/bnewbold/scihash/shadow.20191222.sha1b32.sorted"
+
+ Output(s):
+ Successfully stored 0 records in: "/user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx"
+
+Oops! Didn't upper-case the sha1b32 output.
+
+Full GWB:
+
+ pig -param INPUT_CDX="/user/bnewbold/pdfs/gwb-pdf-20191005172329" -param INPUT_DIGEST="/user/bnewbold/scihash/shadow.20191222.sha1b32.sorted" -param OUTPUT="/user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx" join-cdx-sha1.pig
diff --git a/notes/html_ingest_notes.md b/notes/html_ingest_notes.md
new file mode 100644
index 0000000..a1a91f3
--- /dev/null
+++ b/notes/html_ingest_notes.md
@@ -0,0 +1,318 @@
+
+## Current Plan
+
+- selectolax to extract metadata and quickly filter (speed)
+ => eg, differentiate landing pages from fulltext
+ => also embed URLs?
+- trafilatura for fulltext body extract
+- no solution yet for reference parsing
+ => maybe trafilatura XML-TEI parsing, then GROBID?
+ => especially if DOI/identifier/URL is in the reference
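+
+Minimal sketch of that split (selectolax for a fast metadata/scope pass, trafilatura for the body; the meta tag names are just common examples, and this assumes a trafilatura version supporting `output_format="xmltei"`):
+
+    import trafilatura
+    from selectolax.parser import HTMLParser
+
+    def quick_meta(html):
+        # fast first pass; eg, to differentiate landing pages from fulltext
+        tree = HTMLParser(html)
+        meta = {}
+        for node in tree.css("meta"):
+            name = node.attributes.get("name") or node.attributes.get("property")
+            if name and name.startswith(("citation_", "og:", "dc.")):
+                meta[name] = node.attributes.get("content")
+        return meta
+
+    def extract_body(html):
+        # TEI-XML output could later feed reference parsing (eg, GROBID)
+        return trafilatura.extract(html, output_format="xmltei")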
+
+
+
+TODO:
+x print/wrap error condition better
+x serialize dates (pydantic)
+x CDX lookup "closest" to capture datetime (or by month)
+x firstmonday no extracted fulltext/XML
+x apply URL base fixup to fulltext URLs
+x XML alternative detection
+x basic ingest worker, kafka topics, persist workers, sql table, etc
+- ingest worker: landing page to actual fulltext (eg, OJS)
+- broken? https://betterexplained.com/articles/colorized-math-equations/
+
+Ponder:
+- CDX lookup older successful captures
+ http://www.altdevblogaday.com/2011/05/17/understanding-the-fourier-transform/
+ => optional filter by status? "reduce" by month/year?
+- detect scope heuristically
+ bepress_is_article_cover_page 1
+ citation_fulltext_world_readable "" (eg, distill)
+- non-success subresource fetches
+ https://www.europenowjournal.org/2020/10/11/a-social-history-of-early-rock-n-roll-in-germany-hamburg-from-burlesque-to-the-beatles-1956-1969/
+- redirects: keep start URL?
+
+Later:
+- XML URL extraction
+ https://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-19652002000200001&lng=en&nrm=iso&tlng=pt
+ <a href="http://www.scielo.br/scieloOrg/php/articleXML.php?pid=S0100-19652002000200001&amp;lang=en" rel="nofollow" target="xml">
+- selectolax bug? hangs: `css_first("meta['thing']")`
+- youtube embed
+ => download/include actual video file?
+- parse references in citation headers
+- try parsing references in HTML fulltext
+
+## Testing URLs
+
+- PLOS
+ https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949
+ TODO: "May 9, 2014"
+ TODO: appendix
+- peerj
+ https://peerj.com/articles/4375/
+- scielo
+ http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032020000200081&lng=en&nrm=iso&tlng=es
+ bunch of little icon .png, but ok
+ redirect of an image not saved in webcapture
+- wordpress
+ https://www.europenowjournal.org/2020/10/11/a-social-history-of-early-rock-n-roll-in-germany-hamburg-from-burlesque-to-the-beatles-1956-1969/
+ no HTML meta? hrm
+- old OJS
+ (pdf only) http://rjh.folium.ru/index.php/rjh/article/view/1511
+- new OJS
+ https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729
+- plain HTML
+ http://journal.sjdm.org/12/12627/jdm12627.html
+- blogs/essays
+ http://symbolflux.com/lodessay/
+ https://betterexplained.com/articles/colorized-math-equations/
+ https://web.archive.org/web/20120418231513/http://www.altdevblogaday.com/2011/05/17/understanding-the-fourier-transform/
+ https://research.google.com/bigpicture/attacking-discrimination-in-ml/
+ http://www.econgraphs.org/
+- journal homepage (not fulltext)
+- OJS new landing page (not fulltext)
+- OJS old (not fulltext)
+ http://rjh.folium.ru/index.php/rjh/index
+ http://rjh.folium.ru/index.php/rjh/issue/view/106
+ http://rjh.folium.ru/index.php/rjh/article/view/382
+- distill
+ https://distill.pub/2020/bayesian-optimization/
+ https://distill.pub/2018/feature-wise-transformations/
+- youtube video embed
+ http://www.cond.org/persalog.html
+- youtube video direct?
+- github: project README?
+- wikipedia
+
+## Background Research
+
+- scrapy (?)
+- requests-html: can run javascript
+ => good for metadata extraction?
+- selectolax
+- scrapely: give HTML and extracted text, it builds the parser
+ => good for difficult one-off cases?
+- https://rushter.com/blog/python-fast-html-parser/
+- WET generation from WARC, a la common crawl
+- https://towardsdatascience.com/categorizing-world-wide-web-c130abd9b717
+
+Other random stuff:
+- distilBERT: most BERT accuracy, 0.4 factor latency (faster)?
+ https://medium.com/huggingface/distilbert-8cf3380435b5
+- htmldate: finds "date of publication" for a document
+- adblockparser
+ => good as a filter in HTML ingest
+- w3lib: utility library. unicode conversion; cleanups; etc
+- courlan: clean/normalize/sample large URL lists
+ => https://github.com/adbar/courlan
+
+### Main Text Extraction
+
+Things to try:
+
+- newspaper3k
+ => basic article extraction. lxml
+- trafilatura
+ => TEI-XML output!
+ => looks very promising
+ => falls back to readability and justext
+- python-readability
+ => improved vs newspaper?
+- dragnet
+- eatiht
+- jusText
+- inscriptis
+ => emphasis on shape/readability of text output? compare with lynx
+- Goose3
+ => metadata and article text
+- news-please
+ => very full-featured. build on scrapy, newspaper, readability
+ => can iterate over common crawl?
+- html2text
+ => actually HTML-to-markdown; no or little "boilerplate removal"
+- boilerpipe (Java)
+ boilerpipe3 (wrapper)
+ boilerpy3 (port)
+
+Comparisons and articles:
+
+- https://www.diffbot.com/benefits/comparison/
+- https://github.com/scrapinghub/article-extraction-benchmark
+ - https://github.com/scrapinghub/article-extraction-benchmark/releases/download/v1.0.0/paper-v1.0.0.pdf
+- https://github.com/rundimeco/waddle
+
+- https://moz.com/devblog/benchmarking-python-content-extraction-algorithms-dragnet-readability-goose-and-eatiht
+- https://hal.archives-ouvertes.fr/hal-02768510v3/document (fr; June 2020)
+ https://translate.google.com/translate?sl=auto&tl=en&u=https%3A%2F%2Fhal.archives-ouvertes.fr%2Fhal-02768510v3%2Fdocument
+- http://eprints.fri.uni-lj.si/1718/1/Kovacic-1.pdf (2012)
+- "Generic Web Content Extraction with Open-Source Software" (2020; trafilatura)
+- "Out-of-the-Box and Into the Ditch? Multilingual Evaluation of Generic Text Extraction Tools"
+ https://hal.archives-ouvertes.fr/hal-02732851/document
+ very on-topic
+- https://cloud.google.com/blog/products/gcp/problem-solving-with-ml-automatic-document-classification
+
+### Reference/Citation Extraction
+
+"Locating and parsing bibliographic references in HTML medical articles"
+https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2903768/
+
+cb2bib (in debian/ubuntu)
+
+
+### Metadata Extraction
+
+OJS 3.x seems to have `citation_fulltext_html_url`. Annoyingly, has an iframe.
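+
+Eg, a quick selectolax check for that tag and the iframe it wraps (hypothetical helper, not existing code):
+
+    from selectolax.parser import HTMLParser
+
+    def ojs_fulltext_url(html):
+        tree = HTMLParser(html)
+        meta = tree.css_first('meta[name="citation_fulltext_html_url"]')
+        if meta:
+            return meta.attributes.get("content")
+        # the OJS fulltext page itself often just wraps the article in an <iframe>
+        frame = tree.css_first("iframe")
+        if frame:
+            return frame.attributes.get("src")
+        return None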
+
+http://documents.clockss.org/index.php/LOCKSS:_Extracting_Bibliographic_Metadata
+
+https://blog.dshr.org/2013/04/talk-on-lockss-metadata-extraction-at.html
+
+"OXPath": declaritive XPath extension for scraping metadata
+https://journal.code4lib.org/articles/13007
+
+
+## newspaper3k experimentation
+
+ import newspaper
+
+ import nltk
+ nltk.download('punkt')
+
+ # first mondays (OJS) fulltext
+ monday = newspaper.Article("https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729?inline=1")
+ # => ugh, iframe
+ monday.download()
+ monday.parse() # several seconds
+
+ monday.title
+ # Surveillance, stigma and sociotechnical design for HIV
+ monday.text
+ # reasonable; similar to pdftotext?
+ monday.authors
+ # empty
+ monday.images
+ # reasonable?
+
+ nih = newspaper.Article('https://www.nlm.nih.gov/pubs/techbull/ja02/ja02_locatorplus_merge.html')
+ nih.download()
+ nih.parse()
+ nih.nlp()
+
+ nih.title
+ # Migration of Monographic Citations to LocatorPlus: Merge Project. NLM Technical Bulletin. Jul-Aug 2002
+ # duplicate journal name in title
+ nih.authors
+ # none
+ nih.text
+ # Ok. missing first character, weirdly
+
+ genders = newspaper.Article('https://web.archive.org/web/20141230080932id_/http://www.genders.org/g58/g58_fairlie.html')
+ genders.download()
+ genders.parse()
+
+ genders.title
+ # Presenting innovative theories in art, literature, history, music, TV and film.
+ # nope: this is title of the journal
+
+ genders.text
+ # Ok. includes title and author in the body.
+
+ dlib = newspaper.Article('http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html')
+ dlib.download()
+ dlib.parse()
+
+ dlib.title
+ # Transforming Libraries and Archives through Crowdsourcing
+ dlib.authors()
+ # none
+ dlib.text
+ # some other junk, but main body there
+
+## trafilatura experimentation
+
+ trafilatura --json -u 'http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html' | jq .
+
+ trafilatura --xmltei -u 'http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html'
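+
+Same thing from the Python API (assuming the installed trafilatura version supports these calls):
+
+    import trafilatura
+
+    downloaded = trafilatura.fetch_url("http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html")
+    as_json = trafilatura.extract(downloaded, output_format="json")
+    as_tei = trafilatura.extract(downloaded, output_format="xmltei")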
+
+Does not work with `first_monday_ojs_inline`?
+
+May need to test/compare more.
+
+Examples/bugs:
+
+ http://web.archive.org/web/20081120141035id_/http://www.mundanebehavior.org/issues/v5n1/jones.htm
+ poor title detection
+
+ generally, author detection not great.
+ not, apparently, using detection of dc.authors etc
+
+
+## Prod Deployment Notes (2020-12-14)
+
+Created `html_meta` table in `sandcrawler-db`.
+
+Updated ansible roles to deploy persist and import workers. Then ran the roles
+and enabled:
+
+- sandcrawler database (aitio)
+ - sandcrawler-persist-ingest-file-worker@1: restarted
+- blobs (wbgrp-svc169)
+ - sandcrawler-persist-html-teixml-worker@1: started and enabled
+ - sandcrawler-persist-xml-doc-worker@1: started and enabled
+- fatcat prod worker (wbgrp-svc502)
+ - fatcat-import-ingest-web-worker: started and enabled
+
+Test some d-lib and first monday ingests:
+
+ # dlib
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html --limit 50 container --container-id ugbiirfvufgcjkx33r3cmemcuu
+ => Counter({'estimate': 803, 'ingest_request': 50, 'elasticsearch_release': 50, 'kafka': 50})
+
+ # first monday
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html --limit 50 container --container-id svz5ul6qozdjhjhk7d627avuja
+
+Starting:
+
+ d-lib: 253 / 1056 preserved (https://fatcat.wiki/container/ugbiirfvufgcjkx33r3cmemcuu/coverage)
+
+Initially, `fatcat-import-ingest-web-worker` is seeing these but doesn't seem
+to be importing.
+
+ # postgresql shell
+ select sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count from html_meta;
+ => initially has_teixml is false for all
+ => fixed in an update
+
+ # weed shell
+ > fs.ls /buckets/sandcrawler/html_body
+ [...]
+ > fs.cat /buckets/sandcrawler/html_body/77/75/7775adf8c7e19151bbe887bfa08a575483291d7c.tei.xml
+ [looks like fine TEI-XML]
+
+Going to debug ingest issue by dumping results to disk and importing manually
+(best way to see counts):
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -o -10 | rg html | head -n10 | jq . -c > web_ingest_results.json
+
+ export FATCAT_AUTH_WORKER_CRAWL=[...]
+ ./fatcat_import.py ingest-web-results web_ingest_results.json
+ => Counter({'total': 10, 'skip-update-disabled': 9, 'skip': 1, 'skip-hit': 1, 'insert': 0, 'update': 0, 'exists': 0})
+
+ # did some patching (f7a75a01), then re-ran twice and got:
+ => Counter({'total': 10, 'insert': 9, 'skip': 1, 'skip-hit': 1, 'update': 0, 'exists': 0})
+ => Counter({'total': 10, 'exists': 9, 'skip': 1, 'skip-hit': 1, 'insert': 0, 'update': 0})
+
+ # looks good!
+
+Re-ingesting all of d-lib:
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id ugbiirfvufgcjkx33r3cmemcuu
+ => Expecting 803 release objects in search queries
+ => Counter({'ingest_request': 803, 'elasticsearch_release': 803, 'estimate': 803, 'kafka': 803})
+
+TODO:
+
+- release ES transform isn't counting these as `in_ia` or preserved (code-only change)
+- no indication in search results (ES schema change)
+- ingest tool should probably look at `in_ia_html` or `in_ia_pdf` for PDF/XML queries (or a `types_in_ia` list?)
diff --git a/notes/ingest/.gitignore b/notes/ingest/.gitignore
new file mode 100644
index 0000000..343a25c
--- /dev/null
+++ b/notes/ingest/.gitignore
@@ -0,0 +1,2 @@
+*.csv
+*.json
diff --git a/notes/ingest/2019-10-23_testing.md b/notes/ingest/2019-10-23_testing.md
new file mode 100644
index 0000000..481c4e2
--- /dev/null
+++ b/notes/ingest/2019-10-23_testing.md
@@ -0,0 +1,8 @@
+
+exported not-archived DOIs for elife, as well as general list.
+
+ wc -l recent\ missing\ oa\ releases.csv
+ 161828 recent missing oa releases.csv
+
+ wc -l missing\ elife\ DOIs.csv
+ 1779 missing elife DOIs.csv
diff --git a/notes/ingest/2020-01-14_bulk.md b/notes/ingest/2020-01-14_bulk.md
new file mode 100644
index 0000000..9d05cda
--- /dev/null
+++ b/notes/ingest/2020-01-14_bulk.md
@@ -0,0 +1,26 @@
+
+Generate ingest requests from arabesque:
+
+ zcat /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source arxiv --extid-type arxiv --release-stage submitted - | shuf > /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json
+
+ zcat /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source pmc --extid-type pmcid - | shuf > /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json
+
+
+Quick tests locally:
+
+ time head -n100 /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_arxiv.json
+ time head -n100 /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_pubmed.json
+
+These are all wayback success; looking good! Single threaded, from home laptop
+(over tunnel), took about 9 minutes, or 5.5sec/pdf. That's pretty slow even
+with 30x parallelism. Should re-test on actual server. GROBID pre-check should
+help?
+
+With new bulk topic:
+
+ head PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Ok, let them rip:
+
+    cat PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ARXIV-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2020-02-04_ingest_backfills.md b/notes/ingest/2020-02-04_ingest_backfills.md
new file mode 100644
index 0000000..73a42ef
--- /dev/null
+++ b/notes/ingest/2020-02-04_ingest_backfills.md
@@ -0,0 +1,148 @@
+
+
+## Using Fatcat Tool
+
+Want to enqueue some backfill URLs to crawl, now that SPNv2 is on the mend.
+
+Example dry-run:
+
+ ./fatcat_ingest.py --dry-run --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife
+
+Big OA from 2020 (past month):
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 158 release objects in search queries
+ Counter({'ingest_request': 158, 'estimate': 158, 'kafka': 158, 'elasticsearch_release': 158})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name elife
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 2312 release objects in search queries
+ Counter({'kafka': 2312, 'ingest_request': 2312, 'elasticsearch_release': 2312, 'estimate': 2312})
+
+ # note: did 100 first to test
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name plos
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 1185 release objects in search queries
+ Counter({'estimate': 1185, 'ingest_request': 1185, 'elasticsearch_release': 1185, 'kafka': 1185})
+
+ ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 89 release objects in search queries
+ Counter({'elasticsearch_release': 89, 'estimate': 89, 'ingest_request': 89, 'kafka': 89})
+
+ ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher ieee
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 499 release objects in search queries
+ Counter({'kafka': 499, 'ingest_request': 499, 'estimate': 499, 'elasticsearch_release': 499})
+
+ ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name bmj
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 28 release objects in search queries
+ Counter({'elasticsearch_release': 28, 'ingest_request': 28, 'kafka': 28, 'estimate': 28})
+
+ ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 6225 release objects in search queries
+ Counter({'estimate': 6225, 'kafka': 500, 'elasticsearch_release': 500, 'ingest_request': 500})
+
+ ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 2920 release objects in search queries
+ Counter({'estimate': 2920, 'elasticsearch_release': 1001, 'ingest_request': 1000, 'kafka': 1000})
+
+Hip corona virus papers:
+
+ ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 5332 release objects in search queries
+ Counter({'estimate': 5332, 'elasticsearch_release': 2159, 'ingest_request': 2000, 'kafka': 2000})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 110 release objects in search queries
+ Counter({'ingest_request': 110, 'kafka': 110, 'elasticsearch_release': 110, 'estimate': 110})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV
+ Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests
+ Expecting 589 release objects in search queries
+ Counter({'estimate': 589, 'elasticsearch_release': 589, 'ingest_request': 552, 'kafka': 552})
+
+
+Mixed eLife results:
+
+ ["wrong-mimetype",null,"https://elifesciences.org/articles/54551"]
+ ["success",null,"https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNTE2OTEvZWxpZmUtNTE2OTEtdjEucGRm/elife-51691-v1.pdf?_hash=Jp1cLog1NzIlU%2BvjgLdbM%2BuphOwe5QWUn%2F97tbQBNG4%3D"]
+
+## Re-Request Failed
+
+Select some failed ingest request rows to re-enqueue:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ ) TO '/grande/snapshots/reingest_spn2cdx_20200205.rows.json';
+ -- 1536 rows
+
+Transform back to full requests:
+
+ ./scripts/ingestrequest_row2json.py reingest_spn2cdx_20200205.rows.json > reingest_spn2cdx_20200205.json
+
+Push into kafka (on a kafka broker node):
+
+ cat ~/reingest_spn2cdx_20200205.json | jq . -c | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests -p -1
+
+More:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'error:%'
+ ) TO '/grande/snapshots/reingest_spn2err1_20200205.rows.json';
+ -- COPY 1516
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'spn2-error%'
+ ) TO '/grande/snapshots/reingest_spn2err2_20200205.rows.json';
+ -- COPY 16678
+
+The next large ones to try would be `wayback-error` and `cdx-error`, though
+these are pretty generic. Could go through kafka output to try and understand those
+error classes better.
+
+Oof, by mistake enqueued to partition 1 instead of -1 (random), so these will
+take a week or more to actually process. Re-enqueued as -1; ingesting from
+wayback is pretty fast, so this should result in mostly wayback ingests. Caught
+up by end of weekend?
+
+## Check Coverages
+
+As follow-ups:
+
+ elife: https://fatcat.wiki/container/en4qj5ijrbf5djxx7p5zzpjyoq/coverage
+ => 2020-02-24: 7187 / 8101 = 88% preserved
+ archivist: https://fatcat.wiki/container/zpobyv4vbranllc7oob56tgci4/coverage
+ => 85 preserved
+ => 2020-02-24: 85 / 3005 preserved (TODO)
+ jcancer: https://fatcat.wiki/container/nkkzpwht7jd3zdftc6gq4eoeey/coverage
+ => 2020 preserved
+ => 2520 preserved
+ => 2020-02-24: 2700 / 2766 preserved
+ plos: https://fatcat.wiki/container/23nqq3odsjhmbi5tqavvcn7cfm/coverage
+ => 2020-02-24: 7580 / 7730 = 98% preserved
+
diff --git a/notes/ingest/2020-02-18_ingest_backfills.md b/notes/ingest/2020-02-18_ingest_backfills.md
new file mode 100644
index 0000000..1ab18f4
--- /dev/null
+++ b/notes/ingest/2020-02-18_ingest_backfills.md
@@ -0,0 +1,42 @@
+
+Select:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'spn2-error%'
+ ) TO '/grande/snapshots/reingest_spn2err_20200218.rows.json';
+ => COPY 6537
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'wayback-error'
+ ) TO '/grande/snapshots/reingest_waybackerr_20200218.rows.json';
+ => COPY 33022
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py reingest_spn2err_20200218.rows.json > reingest_spn2err_20200218.json
+ ./scripts/ingestrequest_row2json.py reingest_waybackerr_20200218.rows.json > reingest_waybackerr_20200218.json
+
+Push to kafka:
+
+ cat reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_waybackerr_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+Many had null `ingest_request_source`, so won't actually import into fatcat:
+
+ bnewbold@ia601101$ cat reingest_waybackerr_20200218.json | jq .ingest_request_source | sort | uniq -c | sort -n
+ 1 "savepapernow-web"
+ 112 "fatcat-ingest-container"
+ 11750 "fatcat-changelog"
+ 21159 null
+
diff --git a/notes/ingest/2020-02-21_ingest_backfills.md b/notes/ingest/2020-02-21_ingest_backfills.md
new file mode 100644
index 0000000..48df910
--- /dev/null
+++ b/notes/ingest/2020-02-21_ingest_backfills.md
@@ -0,0 +1,104 @@
+
+Follow-ups to last ingest backfill. Only run these when ingest request topic is
+empty, and full persist chain has run successfully.
+
+## Corona virus stuff
+
+ ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV
+
+## Large OA Publishers
+
+Should probably check domain stats/success for all of these first.
+
+Would also be good to have a "randomize" option. Could fake that by dumping to
+disk first.
+
+ ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier
+
+ ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer
+
+ # ???
+ ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+
+## Fixed OA Publishers (small tests)
+
+ # american archivist
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+ => Expecting 2920 release objects in search queries
+ => Counter({'estimate': 2920, 'elasticsearch_release': 26, 'ingest_request': 25, 'kafka': 25})
+ => good
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter
+ => Expecting 42897 release objects in search queries
+ => Counter({'estimate': 42897, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25})
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher frontiers
+ => Expecting 35427 release objects in search queries
+ => Counter({'estimate': 35427, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25})
+ => mixed results?
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi
+ => Expecting 43111 release objects in search queries
+ => Counter({'estimate': 43111, 'elasticsearch_release': 25, 'ingest_request': 25, 'kafka': 25})
+ => success, fast
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "American Heart Association"
+ => Expecting 185240 release objects in search queries
+ => Counter({'estimate': 185240, 'kafka': 25, 'ingest_request': 25, 'elasticsearch_release': 25})
+ => no success? or mixed? skip for now
+
+ # Environmental Health Perspectives (NIH)
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky
+ => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"]
+ => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"]
+ => FIXED
+ => good (but slow?)
+
+ ./fatcat_ingest.py --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "Tomsk State University"
+ => Expecting 578057 release objects in search queries
+ => Counter({'estimate': 578057, 'elasticsearch_release': 50, 'kafka': 50, 'ingest_request': 50})
+ => nothing from tsu.ru? skip for now
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent"
+ => Expecting 4602 release objects in search queries
+ => Counter({'estimate': 4602, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25})
+ => good
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*"
+ => Expecting 5690 release objects in search queries
+ => Counter({'estimate': 5690, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25})
+ => good
+
+
+## Fixed OA Publishers (full runs)
+
+ # american archivist
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+ Expecting 2920 release objects in search queries
+ Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter
+ Expecting 42986 release objects in search queries
+ Counter({'estimate': 42986, 'elasticsearch_release': 42986, 'kafka': 42935, 'ingest_request': 42935})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi
+ Expecting 43108 release objects in search queries
+ Counter({'estimate': 43108, 'elasticsearch_release': 43108, 'ingest_request': 41262, 'kafka': 41262})
+
+ # Environmental Health Perspectives (NIH)
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky
+ Expecting 12699 release objects in search queries
+ Counter({'elasticsearch_release': 12699, 'estimate': 12699, 'kafka': 12615, 'ingest_request': 12615})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent"
+ Expecting 4602 release objects in search queries
+ Counter({'estimate': 4602, 'ingest_request': 4602, 'kafka': 4602, 'elasticsearch_release': 4602})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*"
+ Expecting 5690 release objects in search queries
+ Counter({'ingest_request': 5690, 'kafka': 5690, 'estimate': 5690, 'elasticsearch_release': 5690})
+
diff --git a/notes/ingest/2020-02-22_fixed_domain.txt b/notes/ingest/2020-02-22_fixed_domain.txt
new file mode 100644
index 0000000..a60de42
--- /dev/null
+++ b/notes/ingest/2020-02-22_fixed_domain.txt
@@ -0,0 +1,246 @@
+
+www.degruyter.com
+
+ "/view/books/" didn't have citation_pdf_url, so added custom URL rule.
+
+    Not sure why the redirect-loop was happening, but it isn't with the current live ingest
+ tool?
+
+ domain | status | count
+ -------------------+-------------------------+-------
+ www.degruyter.com | redirect-loop | 22023
+ www.degruyter.com | no-pdf-link | 8773
+ www.degruyter.com | no-capture | 8617
+ www.degruyter.com | success | 840
+ www.degruyter.com | link-loop | 59
+ www.degruyter.com | terminal-bad-status | 23
+ www.degruyter.com | wrong-mimetype | 12
+ www.degruyter.com | spn-error | 4
+ www.degruyter.com | spn2-cdx-lookup-failure | 4
+ www.degruyter.com | spn2-error:proxy-error | 1
+ www.degruyter.com | spn-remote-error | 1
+ www.degruyter.com | gateway-timeout | 1
+ www.degruyter.com | petabox-error | 1
+ (13 rows)
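+
+    The rule is a per-domain heuristic in the HTML fulltext-URL extraction;
+    shaped roughly like this (illustrative only; the rewritten path below is
+    a guess, not the actual rule):
+
+        def degruyter_custom_pdf_url(html_url):
+            # hypothetical: map /view/ HTML pages to a PDF download path
+            if "://www.degruyter.com/view/" in html_url:
+                return html_url.replace("/view/", "/downloadpdf/") + ".pdf"
+            return None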
+
+www.frontiersin.org
+
+ no pdf link
+
+ seems to live ingest fine? files served from "*.blob.core.windows.net"
+ no fix, just re-ingest.
+
+ domain | status | count
+ ---------------------+-------------------------+-------
+ www.frontiersin.org | no-pdf-link | 17503
+ www.frontiersin.org | terminal-bad-status | 6696
+ www.frontiersin.org | wayback-error | 203
+ www.frontiersin.org | no-capture | 20
+ www.frontiersin.org | spn-error | 6
+ www.frontiersin.org | gateway-timeout | 3
+ www.frontiersin.org | wrong-mimetype | 3
+ www.frontiersin.org | spn2-cdx-lookup-failure | 2
+ www.frontiersin.org | spn2-error:job-failed | 2
+ www.frontiersin.org | spn-remote-error | 1
+ www.frontiersin.org | cdx-error | 1
+ (11 rows)
+
+www.mdpi.com
+
+ terminal-bad-status
+
+ Seems to ingest fine live? No fix, just re-ingest.
+
+ domain | status | count
+ --------------+-------------------------+-------
+ www.mdpi.com | terminal-bad-status | 13866
+ www.mdpi.com | wrong-mimetype | 2693
+ www.mdpi.com | wayback-error | 513
+ www.mdpi.com | redirect-loop | 505
+ www.mdpi.com | success | 436
+ www.mdpi.com | no-capture | 214
+ www.mdpi.com | no-pdf-link | 43
+ www.mdpi.com | spn2-cdx-lookup-failure | 34
+ www.mdpi.com | gateway-timeout | 3
+ www.mdpi.com | petabox-error | 2
+ (10 rows)
+
+www.ahajournals.org | no-pdf-link | 5727
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'www.ahajournals.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%www.ahajournals.org%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ ---------------------+----------------+-------
+ www.ahajournals.org | no-pdf-link | 5738
+ www.ahajournals.org | wrong-mimetype | 84
+ (2 rows)
+
+
+ pdf | https://doi.org/10.1161/circ.110.19.2977 | 2020-02-23 00:28:55.256296+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 |
+ pdf | https://doi.org/10.1161/str.49.suppl_1.tp403 | 2020-02-23 00:27:34.950059+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 |
+ pdf | https://doi.org/10.1161/str.49.suppl_1.tp168 | 2020-02-23 00:25:54.611271+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 |
+ pdf | https://doi.org/10.1161/jaha.119.012131 | 2020-02-23 00:24:44.244511+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 |
+
+ Ah, the ol' annoying 'cookieAbsent'. Works with live SPNv2 via soft-404
+ detection, but that status wasn't coming through, and needed custom
+ pdf-link detection.
+
+ FIXED: added pdf-link detection
+
+ehp.niehs.nih.gov | no-pdf-link | 5772
+
+ simple custom URL format. but are they also blocking?
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'ehp.niehs.nih.gov'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ domain | status | count
+ -------------------+----------------+-------
+ ehp.niehs.nih.gov | no-pdf-link | 5791
+ ehp.niehs.nih.gov | wrong-mimetype | 11
+ (2 rows)
+
+ FIXED: mostly just slow, custom URL seems to work
+
+journals.tsu.ru | no-pdf-link | 4404
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'journals.tsu.ru'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%journals.tsu.ru%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ -----------------+----------------+-------
+ journals.tsu.ru | no-pdf-link | 4409
+ journals.tsu.ru | success | 1
+ journals.tsu.ru | wrong-mimetype | 1
+ (3 rows)
+
+
+ pdf | https://doi.org/10.17223/18572685/57/3 | 2020-02-23 00:45:49.003593+00 | f | no-pdf-link | http://journals.tsu.ru/rusin/&journal_page=archive&id=1907&article_id=42847 | 20200213132322 | 200 |
+ pdf | https://doi.org/10.17223/17267080/71/4 | 2020-02-23 00:31:25.715416+00 | f | no-pdf-link | http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405 | 20200211151825 | 200 |
+ pdf | https://doi.org/10.17223/15617793/399/33 | 2020-02-23 00:29:45.414865+00 | f | no-pdf-link | http://journals.tsu.ru/vestnik/&journal_page=archive&id=1322&article_id=24619 | 20200208152715 | 200 |
+ pdf | https://doi.org/10.17223/19988613/58/15 | 2020-02-23 00:25:24.402838+00 | f | no-pdf-link | http://journals.tsu.ru//history/&journal_page=archive&id=1827&article_id=40501 | 20200212200320 | 200 |
+
+ FIXED: simple new custom PDF link pattern
+
+www.cogentoa.com | no-pdf-link | 4282
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'www.cogentoa.com'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%www.cogentoa.com%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ ------------------+-------------+-------
+ www.cogentoa.com | no-pdf-link | 4296
+ (1 row)
+
+ pdf | https://doi.org/10.1080/23311932.2015.1022632 | 2020-02-23 01:06:14.040013+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311932.2015.1022632 | 20200208054228 | 200 |
+ pdf | https://doi.org/10.1080/23322039.2020.1730079 | 2020-02-23 01:04:53.754117+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23322039.2020.1730079 | 20200223010431 | 200 |
+ pdf | https://doi.org/10.1080/2331186x.2018.1460901 | 2020-02-23 01:04:03.47563+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/2331186X.2018.1460901 | 20200207200958 | 200 |
+ pdf | https://doi.org/10.1080/23311975.2017.1412873 | 2020-02-23 01:03:08.063545+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311975.2017.1412873 | 20200209034602 | 200 |
+ pdf | https://doi.org/10.1080/23311916.2017.1293481 | 2020-02-23 01:02:42.868424+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311916.2017.1293481 | 20200208101623 | 200 |
+
+ FIXED: simple custom URL-based pattern
+
+chemrxiv.org | no-pdf-link | 4186
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'chemrxiv.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%chemrxiv.org%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ --------------+-------------------------+-------
+ chemrxiv.org | no-pdf-link | 4202
+ chemrxiv.org | wrong-mimetype | 64
+ chemrxiv.org | wayback-error | 14
+ chemrxiv.org | success | 12
+ chemrxiv.org | terminal-bad-status | 4
+ chemrxiv.org | spn2-cdx-lookup-failure | 1
+
+ pdf | https://doi.org/10.26434/chemrxiv.9912812.v1 | 2020-02-23 01:08:34.585084+00 | f | no-pdf-link | https://chemrxiv.org/articles/Proximity_Effect_in_Crystalline_Framework_Materials_Stacking-Induced_Functionality_in_MOFs_and_COFs/9912812/1 | 20200215072929 | 200 |
+ pdf | https://doi.org/10.26434/chemrxiv.7150097 | 2020-02-23 01:05:48.957624+00 | f | no-pdf-link | https://chemrxiv.org/articles/Systematic_Engineering_of_a_Protein_Nanocage_for_High-Yield_Site-Specific_Modification/7150097 | 20200213002430 | 200 |
+ pdf | https://doi.org/10.26434/chemrxiv.7833500.v1 | 2020-02-23 00:55:41.013109+00 | f | no-pdf-link | https://chemrxiv.org/articles/Formation_of_Neutral_Peptide_Aggregates_Studied_by_Mass_Selective_IR_Action_Spectroscopy/7833500/1 | 20200210131343 | 200 |
+ pdf | https://doi.org/10.26434/chemrxiv.8146103 | 2020-02-23 00:52:00.193328+00 | f | no-pdf-link | https://chemrxiv.org/articles/On-Demand_Guest_Release_from_MOF-5_Sealed_with_Nitrophenylacetic_Acid_Photocapping_Groups/8146103 | 20200207215449 | 200 |
+ pdf | https://doi.org/10.26434/chemrxiv.10101419 | 2020-02-23 00:46:14.086913+00 | f | no-pdf-link | https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419 | 20200214044153 | 200 |
+
+    FIXED: complex JSON PDF URL extraction; maybe this applies to all of figshare?
+
+TODO:
+x many datacite prefixes go to IRs, but have is_oa:false. we should probably crawl by default based on release_type
+ => fatcat branch bnewbold-more-ingest
+- re-ingest all degruyter (doi_prefix:10.1515)
+ 1456169 doi:10.1515\/*
+ 89942 doi:10.1515\/* is_oa:true
+ 36350 doi:10.1515\/* in_ia:false is_oa:true
+ 1290830 publisher:Gruyter
+ 88944 publisher:Gruyter is_oa:true
+ 40034 publisher:Gruyter is_oa:true in_ia:false
+- re-ingest all frontiersin
+ 248165 publisher:frontiers
+ 161996 publisher:frontiers is_oa:true
+ 36093 publisher:frontiers is_oa:true in_ia:false
+ 121001 publisher:frontiers in_ia:false
+- re-ingest all mdpi
+ 43114 publisher:mdpi is_oa:true in_ia:false
+- re-ingest all ahajournals.org
+ 132000 doi:10.1161\/*
+ 6606 doi:10.1161\/* in_ia:false is_oa:true
+ 81349 publisher:"American Heart Association"
+ 5986 publisher:"American Heart Association" is_oa:true in_ia:false
+- re-ingest all ehp.niehs.nih.gov
+ 25522 doi:10.1289\/*
+ 15315 publisher:"Environmental Health Perspectives"
+ 8779 publisher:"Environmental Health Perspectives" in_ia:false
+ 12707 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true
+- re-ingest all journals.tsu.ru
+ 12232 publisher:"Tomsk State University"
+ 11668 doi:10.17223\/*
+ 4861 publisher:"Tomsk State University" in_ia:false is_oa:true
+- re-ingest all www.cogentoa.com
+ 3421898 doi:10.1080\/*
+ 4602 journal:cogent is_oa:true in_ia:false
+ 5631 journal:cogent is_oa:true (let's recrawl all from publisher domain)
+- re-ingest chemrxiv
+ 8281 doi:10.26434\/chemrxiv*
+ 6918 doi:10.26434\/chemrxiv* in_ia:false
+
+Submit all the above with limits of 1000, then follow up later to check
+whether they succeeded?
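+
+For example, a limited submission for the degruyter batch could look like this
+(a sketch, following the usual fatcat_ingest.py invocation style; flags and
+query assumed):
+
+    ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query 'publisher:Gruyter is_oa:true in_ia:false'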
+
diff --git a/notes/ingest/2020-02_unpaywall.md b/notes/ingest/2020-02_unpaywall.md
new file mode 100644
index 0000000..e18a2ff
--- /dev/null
+++ b/notes/ingest/2020-02_unpaywall.md
@@ -0,0 +1,624 @@
+
+## Stats and Things
+
+ zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | jq .oa_locations[].url_for_pdf -r | rg -v ^null | cut -f3 -d/ | sort | uniq -c | sort -nr > top_domains.txt
+
+## Transform
+
+ zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | pv -l > /dev/null
+ => 22M 1:31:25 [ 4k/s]
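+
+The shuffled request file used below was produced with roughly the same
+pipeline (a sketch; the command above was just a timing pass to /dev/null):
+
+    zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | shuf | gzip > unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz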
+
+Shard it into batches of roughly 1 million (all are 1098096 +/- 1):
+
+ zcat unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz | split -n r/20 -d - unpaywall_snapshot_2019-11-22.ingest_request.split_ --additional-suffix=.json
+
+Test ingest:
+
+ head -n200 unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Add a single batch like:
+
+ cat unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Progress/Status
+
+There are 21,961,928 lines total, in batches of 1,098,097.
+
+ unpaywall_snapshot_2019-11-22.ingest_request.split_00.json
+ => 2020-02-24 21:05 local: 1,097,523 ~22 results/sec (combined)
+ => 2020-02-25 10:35 local: 0
+ unpaywall_snapshot_2019-11-22.ingest_request.split_01.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_02.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_03.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_04.json
+ => 2020-02-25 11:26 local: 4,388,997
+ => 2020-02-25 10:14 local: 1,115,821
+ => 2020-02-26 16:00 local: 265,116
+ unpaywall_snapshot_2019-11-22.ingest_request.split_05.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_06.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_07.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_08.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_09.json
+ => 2020-02-26 16:01 local: 6,843,708
+ => 2020-02-26 16:31 local: 4,839,618
+ => 2020-02-28 10:30 local: 2,619,319
+ unpaywall_snapshot_2019-11-22.ingest_request.split_10.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_11.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_12.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_13.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_14.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_15.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_16.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_17.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_18.json
+ unpaywall_snapshot_2019-11-22.ingest_request.split_19.json
+ => 2020-02-28 10:50 local: 13,551,887
+ => 2020-03-01 23:38 local: 4,521,076
+ => 2020-03-02 10:45 local: 2,827,071
+ => 2020-03-02 21:06 local: 1,257,176
+ added about 500k bulk re-ingest to try and work around cdx errors
+ => 2020-03-02 21:30 local: 1,733,654
+
+## Investigate Failures
+
+Guessing that some domains are ultimately going to need direct "recrawl" via
+SPNv2.
+
+ -- top domain failures for unpaywall GWB history ingest
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ -----------------------------------+---------------------+--------
+ watermark.silverchair.com | terminal-bad-status | 258432
+ www.tandfonline.com | no-pdf-link | 203873
+ journals.sagepub.com | no-pdf-link | 126317
+ iopscience.iop.org | terminal-bad-status | 112526
+ files-journal-api.frontiersin.org | terminal-bad-status | 112499
+ pubs.acs.org | no-pdf-link | 94772
+ www.degruyter.com | redirect-loop | 89801
+ www.ahajournals.org | no-pdf-link | 84025
+ society.kisti.re.kr | no-pdf-link | 72849
+ www.nature.com | redirect-loop | 53575
+ babel.hathitrust.org | terminal-bad-status | 41063
+ www.ncbi.nlm.nih.gov | redirect-loop | 40363
+ scialert.net | no-pdf-link | 38340
+ www.degruyter.com | terminal-bad-status | 34913
+ www.journal.csj.jp | no-pdf-link | 30881
+ espace.library.uq.edu.au | redirect-loop | 24570
+ www.jci.org | redirect-loop | 24409
+ aip.scitation.org | wrong-mimetype | 22144
+ www.vr-elibrary.de | no-pdf-link | 17436
+ www.biorxiv.org | wrong-mimetype | 15524
+ ajph.aphapublications.org | no-pdf-link | 15083
+ zookeys.pensoft.net | redirect-loop | 14867
+ dialnet.unirioja.es | redirect-loop | 14486
+ asa.scitation.org | wrong-mimetype | 14261
+ www.nrcresearchpress.com | no-pdf-link | 14254
+ dl.acm.org | redirect-loop | 14223
+ osf.io | redirect-loop | 14103
+ www.oecd-ilibrary.org | redirect-loop | 12835
+ journals.sagepub.com | redirect-loop | 12229
+ iopscience.iop.org | redirect-loop | 11825
+ (30 rows)
+
+ -- top no-capture terminal domains
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+    => very few from any domain, interesting. Guess many of these are URLs that have truly never been crawled
+
+ -- top no-capture base domains
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ------------------------------+------------+--------
+ academic.oup.com | no-capture | 429888
+ www.nature.com | no-capture | 273825
+ dergipark.org.tr | no-capture | 119847
+ www.biodiversitylibrary.org | no-capture | 110220
+ escholarship.org | no-capture | 106307
+ onlinelibrary.wiley.com | no-capture | 89771
+ journals.sagepub.com | no-capture | 79297
+ www.cell.com | no-capture | 64242
+ deepblue.lib.umich.edu | no-capture | 58080
+ babel.hathitrust.org | no-capture | 52286
+ hal.archives-ouvertes.fr | no-capture | 48549
+ iopscience.iop.org | no-capture | 42591
+ dash.harvard.edu | no-capture | 40767
+ www.tandfonline.com | no-capture | 40638
+ discovery.ucl.ac.uk | no-capture | 40633
+ www.jstage.jst.go.jp | no-capture | 39780
+ www.doiserbia.nb.rs | no-capture | 39261
+ dspace.mit.edu | no-capture | 37703
+ zookeys.pensoft.net | no-capture | 34562
+ repositorio.unesp.br | no-capture | 34437
+ ashpublications.org | no-capture | 34112
+ www.cambridge.org | no-capture | 33959
+ kclpure.kcl.ac.uk | no-capture | 31455
+ society.kisti.re.kr | no-capture | 30427
+ pure.mpg.de | no-capture | 27650
+ download.atlantis-press.com | no-capture | 27253
+ dialnet.unirioja.es | no-capture | 26886
+ link.springer.com | no-capture | 26257
+ www.valueinhealthjournal.com | no-capture | 24798
+ dspace.library.uu.nl | no-capture | 23234
+ (30 rows)
+
+ -- how many ingest requests not crawled at all?
+ SELECT count(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status IS NULL;
+ => 0
+
+ -- "cookie absent" terminal pages, by domain
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ --------------------------------+----------------+--------
+ journals.sagepub.com | no-pdf-link | 126295
+ www.tandfonline.com | no-pdf-link | 116690
+ pubs.acs.org | no-pdf-link | 94619
+ www.ahajournals.org | no-pdf-link | 84016
+ www.journal.csj.jp | no-pdf-link | 30881
+ aip.scitation.org | wrong-mimetype | 22143
+ www.vr-elibrary.de | no-pdf-link | 17436
+ ajph.aphapublications.org | no-pdf-link | 15080
+ asa.scitation.org | wrong-mimetype | 14261
+ www.nrcresearchpress.com | no-pdf-link | 14253
+ journals.ametsoc.org | no-pdf-link | 10500
+ www.journals.uchicago.edu | no-pdf-link | 6917
+ www.icevirtuallibrary.com | no-pdf-link | 6484
+ www.journals.uchicago.edu | wrong-mimetype | 6191
+ www.healthaffairs.org | no-pdf-link | 5732
+ pubsonline.informs.org | no-pdf-link | 5672
+ pinnacle-secure.allenpress.com | no-pdf-link | 5013
+ www.worldscientific.com | no-pdf-link | 4560
+ www.ajronline.org | wrong-mimetype | 4523
+ ehp.niehs.nih.gov | no-pdf-link | 4514
+ www.future-science.com | no-pdf-link | 4091
+ pubs.acs.org | wrong-mimetype | 4015
+ aip.scitation.org | no-pdf-link | 3916
+ www.futuremedicine.com | no-pdf-link | 3821
+ asa.scitation.org | no-pdf-link | 3644
+ www.liebertpub.com | no-pdf-link | 3345
+ physicstoday.scitation.org | no-pdf-link | 3005
+ pubs.cif-ifc.org | no-pdf-link | 2761
+ epubs.siam.org | wrong-mimetype | 2583
+ www.ajronline.org | no-pdf-link | 2563
+ (30 rows)
+
+    -- "cookie absent" terminal page failures, total count
+ SELECT count(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent';
+
+ => 654885
+
+ -- NOT "cookie absent" terminal page failures, total count
+ SELECT count(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent';
+
+ => 1403837
+
+Looks like these domains are almost all "cookieAbsent" blocking:
+- journals.sagepub.com
+- pubs.acs.org
+- ahajournals.org
+- www.journal.csj.jp
+- aip.scitation.org
+
+Grab some individual URLs to test:
+
+ SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+ ORDER BY updated DESC
+ LIMIT 25;
+
+NOT cookieAbsent testing with regular ingest tool:
+- iopscience.iop.org, terminal-bad-status, SPNv2 fetch, success
+- academic.oup.com => silverchair, terminal-bad-status, SPNv2 fetch, success
+- osf.io success
+
+ SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+ ORDER BY updated DESC
+ LIMIT 25;
+
+cookieAbsent testing with regular ingest tool:
+- www.tandfonline.com failure (no-pdf-link via wayback), but force-recrawl works
+
+The main distinguishing factor is status: terminal-bad-status can be ingested
+successfully live, while no-pdf-link, redirect-loop, etc. need to be re-crawled.
+
+## Heritrix Plan
+
+Generate following ingest request batches:
+
+- no-capture status from unpaywall
+- all other failures except /cookieAbsent
+- /cookieAbsent failures
+
+Plan will be to crawl no-capture first (to completion), then try the other
+non-/cookieAbsent failures. /cookieAbsent means we'll need to use SPNv2.
+
+Because there are so few "no-capture on second hop" cases, will not enqueue
+both terminal urls and base urls, only base urls.
+
+Should definitely skip/filter:
+
+- www.ncbi.nlm.nih.gov
+
+## Ingest Request Export
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status = 'no-capture'
+ ) TO '/grande/snapshots/unpaywall_nocapture_20200304.rows.json';
+ => 4,855,142
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+ ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json';
+ => 1,403,837
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200304.rows.json > unpaywall_nocapture_20200304.json
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json > unpaywall_fail_nocookie_20200304.json
+
+Note: will probably end up re-running the below after crawling+ingesting the above:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.status = 'terminal-bad-status'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+ ) TO '/grande/snapshots/unpaywall_fail_cookie_badstatus_20200304.rows.json';
+ => 0
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.status != 'terminal-bad-status'
+ AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'
+ ) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json';
+ => 654,885
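+
+The /cookieAbsent rows will eventually need the regular (SPNv2) ingest path
+rather than bulk; that would look something like this (a sketch, not yet run):
+
+    ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json > unpaywall_fail_cookie_other_20200304.json
+    cat unpaywall_fail_cookie_other_20200304.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1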
+
+## Batch Ingest
+
+Test small batch:
+
+ head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full batch:
+
+ cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ # there was a broken line in there, so...
+ # parse error: Expected separator between values at line 1367873, column 175
+ # tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c > /dev/null
+ tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Note that the crawl is not entirely complete and not all CDX seem to have been
+loaded, so we may need to iterate. About 10% are still "no-capture". We may
+want or need to additionally crawl the terminal URLs, not just the base URLs.
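+
+If terminal URLs are needed, they could be dumped separately from
+ingest_file_result (a sketch, not yet run; output path illustrative):
+
+    COPY (
+        SELECT ingest_file_result.terminal_url
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND ingest_file_result.status = 'no-capture'
+            AND ingest_file_result.terminal_url IS NOT NULL
+    ) TO '/grande/snapshots/unpaywall_nocapture_terminal_20200323.txt';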
+
+## Post-ingest stats
+
+Overall status:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 17354494
+ no-pdf-link | 1471076
+ no-capture | 1135992
+ redirect-loop | 837842
+ terminal-bad-status | 803081
+ cdx-error | 219746
+ wrong-mimetype | 100723
+ link-loop | 16013
+ wayback-error | 12448
+ null-body | 9444
+ redirects-exceeded | 600
+ petabox-error | 411
+ bad-redirect | 17
+ bad-gzip-encoding | 4
+ spn2-cdx-lookup-failure | 3
+ gateway-timeout | 1
+ spn2-error:job-failed | 1
+ spn2-error | 1
+ (18 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ -----------------------------------+---------------------+--------
+ academic.oup.com | no-pdf-link | 330211
+ watermark.silverchair.com | terminal-bad-status | 324599
+ www.tandfonline.com | no-pdf-link | 242724
+ journals.sagepub.com | no-pdf-link | 202050
+ iopscience.iop.org | terminal-bad-status | 144063
+ files-journal-api.frontiersin.org | terminal-bad-status | 121719
+ pubs.acs.org | no-pdf-link | 104535
+ www.ahajournals.org | no-pdf-link | 102653
+ society.kisti.re.kr | no-pdf-link | 101787
+ www.degruyter.com | redirect-loop | 95130
+ www.nature.com | redirect-loop | 87534
+ onlinelibrary.wiley.com | no-pdf-link | 84432
+ www.cell.com | redirect-loop | 61496
+ www.degruyter.com | terminal-bad-status | 42919
+ babel.hathitrust.org | terminal-bad-status | 41813
+ www.ncbi.nlm.nih.gov | redirect-loop | 40488
+ scialert.net | no-pdf-link | 38341
+ ashpublications.org | no-pdf-link | 34889
+ dialnet.unirioja.es | terminal-bad-status | 32076
+ www.journal.csj.jp | no-pdf-link | 30881
+ pure.mpg.de | redirect-loop | 26163
+ www.jci.org | redirect-loop | 24701
+ espace.library.uq.edu.au | redirect-loop | 24591
+ www.valueinhealthjournal.com | redirect-loop | 23740
+ www.vr-elibrary.de | no-pdf-link | 23332
+ aip.scitation.org | wrong-mimetype | 22144
+ osf.io | redirect-loop | 18513
+ www.journals.elsevier.com | no-pdf-link | 16710
+ www.spandidos-publications.com | redirect-loop | 15711
+ www.biorxiv.org | wrong-mimetype | 15513
+ (30 rows)
+
+Dump lists for another iteration of bulk ingest:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status = 'no-capture'
+ ) TO '/grande/snapshots/unpaywall_nocapture_20200323.rows.json';
+ => 278,876
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND ingest_file_result.status != 'success'
+ AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'
+ ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200323.rows.json';
+ =>
+
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200323.rows.json > unpaywall_nocapture_20200323.json
+
+ cat unpaywall_nocapture_20200323.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-03-02_ingests.txt b/notes/ingest/2020-03-02_ingests.txt
new file mode 100644
index 0000000..e98ef33
--- /dev/null
+++ b/notes/ingest/2020-03-02_ingests.txt
@@ -0,0 +1,174 @@
+
+## protocols.io
+
+Tested that single-item ingest is working; they fixed the PDF format on their
+end recently.
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --name protocols.io
+ => Expecting 8448 release objects in search queries
+ => Counter({'estimate': 8448, 'kafka': 8448, 'ingest_request': 8448, 'elasticsearch_release': 8448})
+
+## backfill follow-ups
+
+- re-ingest all degruyter (doi_prefix:10.1515)
+ 89942 doi:10.1515\/* is_oa:true
+ 36350 doi:10.1515\/* in_ia:false is_oa:true
+ 40034 publisher:Gruyter is_oa:true in_ia:false
+ => update:
+ 135926 doi:10.1515\/* is_oa:true
+ 50544 doi:10.1515\/* in_ia:false is_oa:true
+ 54880 publisher:Gruyter is_oa:true in_ia:false
+- re-ingest all frontiersin
+ 36093 publisher:frontiers is_oa:true in_ia:false
+ => update
+ 22444 publisher:frontiers is_oa:true in_ia:false
+ 22029 doi_prefix:10.3389 is_oa:true in_ia:false
+
+ select status, count(*) from ingest_file_result where base_url like 'https://doi.org/10.3389/%' group by status order by count(*) desc;
+
+ status | count
+ -------------------------------------+-------
+ success | 34721
+ no-pdf-link | 18157
+ terminal-bad-status | 6799
+ cdx-error | 1805
+ wayback-error | 333
+ no-capture | 301
+ [...]
+
+ select * from ingest_file_result where base_url like 'https://doi.org/10.17723/aarc%' and status = 'no-pdf-link' order by updated desc limit 100;
+
+- re-ingest all mdpi
+ 43114 publisher:mdpi is_oa:true in_ia:false
+ => update
+ 8548 publisher:mdpi is_oa:true in_ia:false
+
+ select status, count(*) from ingest_file_result where base_url like 'https://doi.org/10.3390/%' group by status order by count(*) desc;
+ status | count
+ -------------------------------------+--------
+ success | 108971
+ cdx-error | 6655
+ wrong-mimetype | 3359
+ terminal-bad-status | 1299
+ wayback-error | 151
+ spn2-cdx-lookup-failure | 87
+
+ => added hack for gzip content-encoding coming through pdf fetch
+ => will re-ingest all after pushing fix
+
+- re-ingest all ahajournals.org
+ 132000 doi:10.1161\/*
+ 6606 doi:10.1161\/* in_ia:false is_oa:true
+ 81349 publisher:"American Heart Association"
+ 5986 publisher:"American Heart Association" is_oa:true in_ia:false
+ => update
+ 1337 publisher:"American Heart Association" is_oa:true in_ia:false
+
+ status | count
+ -------------------------------------+-------
+ success | 1480
+ cdx-error | 1176
+ spn2-cdx-lookup-failure | 514
+ no-pdf-link | 85
+ wayback-error | 25
+ spn2-error:job-failed | 18
+
+ => will re-run errors
+- re-ingest all ehp.niehs.nih.gov
+ 25522 doi:10.1289\/*
+ 15315 publisher:"Environmental Health Perspectives"
+ 8779 publisher:"Environmental Health Perspectives" in_ia:false
+ 12707 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true
+ => update
+ 7547 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true
+- re-ingest all journals.tsu.ru
+ 12232 publisher:"Tomsk State University"
+ 11668 doi:10.17223\/*
+ 4861 publisher:"Tomsk State University" in_ia:false is_oa:true
+ => update
+ 2605 publisher:"Tomsk State University" in_ia:false is_oa:true
+ => just need to retry these? seem fine
+- re-ingest all www.cogentoa.com
+ 3421898 doi:10.1080\/*
+ 4602 journal:cogent is_oa:true in_ia:false
+ 5631 journal:cogent is_oa:true (let's recrawl all from publisher domain)
+ => update
+ 254 journal:cogent is_oa:true in_ia:false
+- re-ingest chemrxiv
+ 8281 doi:10.26434\/chemrxiv*
+ 6918 doi:10.26434\/chemrxiv* in_ia:false
+ => update
+ 4890 doi:10.26434\/chemrxiv* in_ia:false
+ => re-ingest
+ => allow non-OA
+
+ # american archivist
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4
+ Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911})
+ => 2020-02-04: 85 / 3,005
+ => 2020-03-02: 2,182 / 3,005 preserved. some no-pdf-link, otherwise just a bunch of spn2-error
+    => looks like the no-pdf-link status is due to a pinnacle-secure.allenpress.com soft-blocking loop
+
+
+## backfill re-ingests
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa --force-recrawl container --container-id zpobyv4vbranllc7oob56tgci4
+ => Counter({'elasticsearch_release': 823, 'estimate': 823, 'ingest_request': 814, 'kafka': 814})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter
+ => Counter({'elasticsearch_release': 54880, 'estimate': 54880, 'kafka': 51497, 'ingest_request': 51497})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query 'publisher:"Tomsk State University"'
+ => Counter({'ingest_request': 2605, 'kafka': 2605, 'elasticsearch_release': 2605, 'estimate': 2605})
+
+ ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*"
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi
+ => Counter({'estimate': 8548, 'elasticsearch_release': 8548, 'ingest_request': 6693, 'kafka': 6693})
+ => NOTE: about 2k not enqueued
+
+## re-ingest all broken
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '1 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'spn2-%'
+ ) TO '/grande/snapshots/reingest_spn2_20200302.rows.json';
+ => COPY 14849
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'cdx-error'
+ ) TO '/grande/snapshots/reingest_cdxerr_20200302.rows.json';
+ => COPY 507610
+
+ This is a huge number! Re-ingest via bulk?
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2_20200302.rows.json > reingest_spn2_20200302.json
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdxerr_20200302.rows.json > reingest_cdxerr_20200302.json
+
+Push to kafka:
+
+ cat reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+    # accidentally also piped the above through ingest-file-requests-bulk...
+ # which could actually be bad
+ cat reingest_cdxerr_20200302.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## biorxiv/medrxiv
+
+ 8026 doi:10.1101\/20*
+ 2159 doi:10.1101\/20* in_ia:false
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 'doi:10.1101\/20* in_ia:false'
+ => Counter({'estimate': 2159, 'ingest_request': 2159, 'elasticsearch_release': 2159, 'kafka': 2159})
+
diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md
new file mode 100644
index 0000000..73396bd
--- /dev/null
+++ b/notes/ingest/2020-03-oa_but_not_marked.md
@@ -0,0 +1,25 @@
+
+These are large journals with a high fraction of "in IA", but not marked as OA
+so not crawling regularly.
+
+TODO: add things like list of unpaywall ISSN / OA status to try and find more
+"practical" / bronze OA
+
+## First Run
+
+https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him
+https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4
+https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4
+https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e
+https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm
+https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe
+
+## TODO
+
+https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible)
+https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?)
+
+https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link?
+https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA?
+https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken?
+https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop
diff --git a/notes/ingest/2020-03_mag.md b/notes/ingest/2020-03_mag.md
new file mode 100644
index 0000000..428ce05
--- /dev/null
+++ b/notes/ingest/2020-03_mag.md
@@ -0,0 +1,576 @@
+
+Rough plan:
+
+- run bulk and/or regular ingest requests for just those of AIT partners (200k?)
+- persist ingest requests (22 million or so)
+- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall)
+- crawl those which are no-capture
+
+
+## Generate Requests
+
+The newer version of the `mag_ingest_request.sh` script requires a venv with
+urlcanon installed.
+
+Starting with the 2020-01-23 MAG dump, will generate a full ingest request set
+(including DOI `ext_id` when available), with any dominant domains removed (eg,
+arxiv.org):
+
+ export LC_ALL=C
+ cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json
+ => previously 25.6M
+ => 25.6M 2:29:43 [2.85k/s]
+
+ export LC_ALL=C
+ zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json
+ => 4.3M 0:25:45 [2.78k/s]
+
+ export LC_ALL=C
+ cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id
+
+ zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l
+ => 6,504,907
+
+ zcat PaperUrls_mag_url_pmid.txt.gz | wc -l
+ => 4,369,832
+
+ cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l
+ => previously 15,707,405
+ => 15,702,581
+
+ cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l
+ => 0
+ URL encoding seems to be working
+
+## Persist Ingest Requests
+
+First the PMID ingest requests, then the all/DOI file. The reason for this
+order is that the all/DOI file will have some rows with no DOI (and thus no
+`ext_id`), while the PMID file will not.
+
+ # small sample
+ head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
+ Worker: Counter({'total': 10, 'skip-result-fields': 10})
+ JSON lines pushed: Counter({'total': 10, 'pushed': 10})
+
+ cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
+ => 4.3M 0:16:46 [4.27k/s]
+ Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0})
+ JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026})
+ => hit a bug on first attempt, which is why total/insert results don't match
+
+ cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request -
+ => 25.6M 2:21:54 [3.01k/s]
+ Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0})
+ JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559})
+
+
+## Crawl/Dupe Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+After just PMID links:
+
+ status | count
+ ---------------------+---------
+ | 3000115
+ success | 1126881
+ no-capture | 69459
+ terminal-bad-status | 30259
+ redirect-loop | 11656
+ no-pdf-link | 2836
+ wrong-mimetype | 1456
+ link-loop | 1259
+ wayback-error | 1232
+ cdx-error | 932
+ null-body | 85
+ petabox-error | 50
+ bad-redirect | 1
+ (13 rows)
+
+After all links:
+
+ SELECT COUNT(*)
+ FROM ingest_request
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag';
+ => 25596563
+
+
+ status | count
+ ---------------------+----------
+ | 21130841
+ success | 3915682
+ no-capture | 391813
+ terminal-bad-status | 76488
+ redirect-loop | 44202
+ wrong-mimetype | 16418
+ no-pdf-link | 10995
+ wayback-error | 3679
+ cdx-error | 3414
+ link-loop | 2098
+ null-body | 709
+ petabox-error | 221
+ bad-gzip-encoding | 2
+ bad-redirect | 1
+ (14 rows)
+
+Somewhat more un-ingested than expected.
+
+Dump requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/mag_noingest_20200305.rows.json';
+ => COPY 21,130,841
+
+Transform and shuf:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz
+ => 21.1M 0:18:57 [18.6k/s]
+
+## Bulk Ingest Partner Output
+
+These are subsets of the full list from potential AIT-S partners; want to run
+these through the pipeline before the full batch. Duplication against the full
+batch should be minimal.
+
+Size:
+
+ bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l
+ 29007
+ bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json
+ 34265 ingest_requests_mag-2020-01-23.cornell.json
+
+Test ingest:
+
+ head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full ingests:
+
+ cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Bulk Ingest
+
+Shard it into batches of roughly 1 million:
+
+ cd /grande/snapshots/
+ zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json
+
+Add a single batch like:
+
+ cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ partner ingests (see above)
+ => 2020-03-05 12:49: 118,396
+ 1056543 mag_noingest_20200305.ingest_request.split_00.json
+ => 2020-03-05 14:34: 1,055,224
+ => check on stats/ratios; filter by ingest update time?
+ 1056542 mag_noingest_20200305.ingest_request.split_01.json
+ 1056542 mag_noingest_20200305.ingest_request.split_02.json
+ 1056542 mag_noingest_20200305.ingest_request.split_03.json
+ 1056542 mag_noingest_20200305.ingest_request.split_04.json
+ 1056542 mag_noingest_20200305.ingest_request.split_05.json
+ 1056542 mag_noingest_20200305.ingest_request.split_06.json
+ 1056542 mag_noingest_20200305.ingest_request.split_07.json
+ 1056542 mag_noingest_20200305.ingest_request.split_08.json
+ 1056542 mag_noingest_20200305.ingest_request.split_09.json
+ => 2020-03-05 18:04: 10,009,297
+ => 2020-03-06 16:53: 6,553,946
+ 1056542 mag_noingest_20200305.ingest_request.split_10.json
+ 1056542 mag_noingest_20200305.ingest_request.split_11.json
+ 1056542 mag_noingest_20200305.ingest_request.split_12.json
+ 1056542 mag_noingest_20200305.ingest_request.split_13.json
+ 1056542 mag_noingest_20200305.ingest_request.split_14.json
+ 1056542 mag_noingest_20200305.ingest_request.split_15.json
+ 1056542 mag_noingest_20200305.ingest_request.split_16.json
+ 1056542 mag_noingest_20200305.ingest_request.split_17.json
+ 1056542 mag_noingest_20200305.ingest_request.split_18.json
+ 1056542 mag_noingest_20200305.ingest_request.split_19.json
+ => 2020-03-06 16:59: 17,001,032
+
+Stats from bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ ---------------------+----------
+ no-capture | 12237193
+ success | 11991293
+ no-pdf-link | 521691
+ redirect-loop | 437192
+ terminal-bad-status | 231181
+ link-loop | 92633
+ cdx-error | 33631
+ wrong-mimetype | 28638
+ wayback-error | 19651
+ null-body | 2682
+ petabox-error | 727
+ | 47
+ bad-redirect | 44
+ bad-gzip-encoding | 7
+ (14 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ --------------------------------------+---------------------+--------
+ dialnet.unirioja.es | redirect-loop | 240967
+ onlinelibrary.wiley.com | no-pdf-link | 147696
+ agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639
+ iopscience.iop.org | terminal-bad-status | 69591
+ febs.onlinelibrary.wiley.com | no-pdf-link | 49874
+ www.researchgate.net | redirect-loop | 42859
+ journals.sagepub.com | no-pdf-link | 27448
+ papers.ssrn.com | redirect-loop | 27328
+ dialnet.unirioja.es | terminal-bad-status | 20320
+ physoc.onlinelibrary.wiley.com | no-pdf-link | 20232
+ science.sciencemag.org | link-loop | 17811
+ espace.library.uq.edu.au | redirect-loop | 17185
+ bpspubs.onlinelibrary.wiley.com | no-pdf-link | 15785
+ obgyn.onlinelibrary.wiley.com | no-pdf-link | 15301
+ anthrosource.onlinelibrary.wiley.com | no-pdf-link | 13746
+ www.tandfonline.com | no-pdf-link | 13303
+ aasldpubs.onlinelibrary.wiley.com | no-pdf-link | 11070
+ link.springer.com | redirect-loop | 10594
+ www.redalyc.org:9081 | no-pdf-link | 10515
+ watermark.silverchair.com | terminal-bad-status | 9739
+ www.bmj.com | link-loop | 9389
+ www.repository.naturalis.nl | redirect-loop | 8213
+ bjp.rcpsych.org | link-loop | 8045
+ aslopubs.onlinelibrary.wiley.com | no-pdf-link | 7814
+ nph.onlinelibrary.wiley.com | no-pdf-link | 7801
+ iopscience.iop.org | redirect-loop | 7697
+ journals.tubitak.gov.tr | wrong-mimetype | 7159
+ www.biorxiv.org | wrong-mimetype | 7067
+ www.erudit.org | redirect-loop | 6819
+ besjournals.onlinelibrary.wiley.com | no-pdf-link | 6254
+ (30 rows)
+
+Domains to follow-up (eg, sandcrawler ingest tests/tweaks):
+- dialnet.unirioja.es | redirect-loop | 240967
+- www.researchgate.net | redirect-loop | 42859
+- www.redalyc.org:9081 | no-pdf-link | 10515
+- www.repository.naturalis.nl | redirect-loop | 8213
+- bjp.rcpsych.org | link-loop | 8045
+- journals.tubitak.gov.tr | wrong-mimetype | 7159
+- www.erudit.org | redirect-loop | 6819
+
+The dialnet.unirioja.es ones may be worth re-crawling via heritrix?
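+
+If so, a domain-scoped seed dump could be pulled along these lines (a sketch,
+not yet run; output path illustrative):
+
+    COPY (
+        SELECT ingest_request.base_url FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'mag'
+            AND ingest_file_result.terminal_url LIKE '%dialnet.unirioja.es%'
+            AND ingest_file_result.status = 'redirect-loop'
+    ) TO '/grande/snapshots/mag_dialnet_redirect_loop_20200313.rows.txt';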
+
+Top uncrawled domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ---------------------------------+------------+--------
+ ieeexplore.ieee.org | no-capture | 957835
+ link.springer.com | no-capture | 394121
+ www.researchgate.net | no-capture | 376974
+ cyberleninka.ru | no-capture | 376012
+ iopscience.iop.org | no-capture | 348791
+ papers.ssrn.com | no-capture | 286860
+ dergipark.org.tr | no-capture | 217556
+ dialnet.unirioja.es | no-capture | 214398
+ academic.oup.com | no-capture | 212364
+ www.tandfonline.com | no-capture | 148940
+ journals.sagepub.com | no-capture | 144695
+ www.papersearch.net | no-capture | 138986
+ absimage.aps.org | no-capture | 111976
+ apps.dtic.mil | no-capture | 106984
+ www.cambridge.org | no-capture | 97533
+ www.bmj.com | no-capture | 92437
+ bioone.org | no-capture | 87573
+ science.sciencemag.org | no-capture | 75723
+ shodhganga.inflibnet.ac.in:8080 | no-capture | 75395
+ www.jstor.org | no-capture | 73230
+ works.bepress.com | no-capture | 68747
+ www.scielo.org.co | no-capture | 59650
+ hrcak.srce.hr | no-capture | 59332
+ muse.jhu.edu | no-capture | 57828
+ onlinelibrary.wiley.com | no-capture | 55621
+ www.jbc.org | no-capture | 54608
+ www.jstage.jst.go.jp | no-capture | 53631
+ www.redalyc.org | no-capture | 50406
+ lup.lub.lu.se | no-capture | 47469
+ www.dtic.mil | no-capture | 41820
+ (30 rows)
+
+## Heritrix Seedlist Generation
+
+Dump ingest requests (filtered to exclude some domains that we don't expect to
+crawl via heritrix):
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json';
+ => COPY 11714199
+
+ # in sandcrawler pipenv
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json
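+
+A flat seedlist for heritrix can then be extracted from the request JSON, e.g.
+(a sketch; output path illustrative):
+
+    jq -r .base_url /grande/snapshots/mag_nocapture_20200313.json | sort -u > /grande/snapshots/mag_nocapture_20200313.seeds.txt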
+
+## Bulk Ingest of Heritrix Content
+
+Small sample:
+
+ head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+ cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ 2020-04-07 12:19 (pacific): 11,703,871
+
+## Post-bulk-ingest
+
+Around 2020-04-28, it seems like the main wave of bulk ingest is complete.
+Will need to re-try things like cdx-error.
+
+Current status:
+
+ status | count
+ -------------------------------+----------
+ success | 18491799
+ redirect-loop | 1968530
+ no-capture | 1373657
+ no-pdf-link | 1311842
+ link-loop | 1296439
+ terminal-bad-status | 627577
+ cdx-error | 418278
+ wrong-mimetype | 50141
+ wayback-error | 37159
+ petabox-error | 11249
+ null-body | 6295
+ gateway-timeout | 3051
+ spn2-cdx-lookup-failure | 328
+ spn2-error:invalid-url-syntax | 93
+ bad-redirect | 75
+ | 47
+ invalid-host-resolution | 28
+ spn2-error | 10
+ bad-gzip-encoding | 7
+ redirects-exceeded | 2
+ (20 rows)
+
+Lots of cdx-error to retry.
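+
+Those could be dumped and re-enqueued via bulk (a sketch, not yet run; output
+path illustrative):
+
+    COPY (
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'mag'
+            AND ingest_file_result.status = 'cdx-error'
+    ) TO '/grande/snapshots/mag_cdxerror_20200428.rows.json';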
+
+The no-capture links are probably a mix of domain-blocklist hits and things
+that failed in bulk mode. Will dump and re-attempt them:
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json';
+ => 859849
+
+What domains are these?
+
+ cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30
+
+Let's filter down more:
+
+ cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json
+
+ wc -l mag_nocapture_20200420.rows.filtered.json
+ 423085 mag_nocapture_20200420.rows.filtered.json
+
+Ok, enqueue!
+
+ cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## Final Stats
+
+... for this round of ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------------+----------
+ success | 18712849
+ redirect-loop | 2008110
+ no-pdf-link | 1337012
+ link-loop | 1326761
+ no-capture | 1030693
+ terminal-bad-status | 637143
+ gateway-timeout | 193194
+ cdx-error | 125907
+ spn2-cdx-lookup-failure | 77842
+ wrong-mimetype | 50882
+ wayback-error | 40278
+ invalid-host-resolution | 35201
+ petabox-error | 11254
+ null-body | 6485
+ spn2-error | 1643
+ spn2-error:job-failed | 747
+ spn2-error:invalid-url-syntax | 325
+ spn2-error:soft-time-limit-exceeded | 190
+ bad-redirect | 77
+ | 47
+ (20 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ domain | status | count
+ ---------------------------------+---------------------+--------
+ ieeexplore.ieee.org | redirect-loop | 677712
+ cyberleninka.ru | link-loop | 308390
+ papers.ssrn.com | link-loop | 281804
+ ieeexplore.ieee.org | link-loop | 273559
+ dialnet.unirioja.es | redirect-loop | 240504
+ dialnet.unirioja.es | terminal-bad-status | 232481
+ onlinelibrary.wiley.com | no-pdf-link | 220932
+ iopscience.iop.org | terminal-bad-status | 172480
+ validate.perfdrive.com | no-pdf-link | 172312
+ link.springer.com | redirect-loop | 130398
+ agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382
+ iopscience.iop.org | redirect-loop | 105234
+ www.bmj.com | link-loop | 100354
+ www.researchgate.net | redirect-loop | 84366
+ www.cambridge.org | link-loop | 83171
+ jamanetwork.com | no-pdf-link | 75053
+ febs.onlinelibrary.wiley.com | no-pdf-link | 74872
+ www.jstor.org | redirect-loop | 72059
+ journals.sagepub.com | no-pdf-link | 63028
+ science.sciencemag.org | redirect-loop | 62927
+ profile.thieme.de | no-pdf-link | 62406
+ cyberleninka.ru | redirect-loop | 56733
+ link.springer.com | link-loop | 47608
+ physoc.onlinelibrary.wiley.com | no-pdf-link | 30180
+ science.sciencemag.org | link-loop | 29908
+ papers.ssrn.com | redirect-loop | 27255
+ obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789
+ www.computer.org | no-pdf-link | 26444
+ watermark.silverchair.com | terminal-bad-status | 25934
+ www.nature.com | redirect-loop | 25306
+ (30 rows)
diff --git a/notes/ingest/2020-03_s2.md b/notes/ingest/2020-03_s2.md
new file mode 100644
index 0000000..fedaba0
--- /dev/null
+++ b/notes/ingest/2020-03_s2.md
@@ -0,0 +1,35 @@
+
+Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these
+ingested, as well as any previously existing content.
+
+Also, there are a bunch of PDF outlinks to the web; should do S2-specific
+matching and ingest of those.
+
+There are a few categories of paper from pdfs.s.o:
+
+1. we had previous GWB crawl, didn't re-crawl
+2. we had PDF from elsewhere on the web, didn't re-crawl
+3. crawled successfully
+4. crawl failed
+
+In this ingest, we want to get all of categories 1 and 3. Could try to do this
+by dumping the sandcrawler CDX table rows matching pdfs.s.o (which includes the
+recent crawl), and joining that against the ingest request list.
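+
+If the pdfs.semanticscholar.org requests are loaded into the request table,
+that join could look something like this (hypothetical sketch: assumes a
+sandcrawler `cdx` table with a `url` column, and that these requests get
+`link_source = 's2'`; not run):
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 's2'
+            AND ingest_request.base_url LIKE '%pdfs.semanticscholar.org%'
+            -- keep only URLs with at least one existing capture (categories 1 and 3)
+            AND EXISTS (SELECT 1 FROM cdx WHERE cdx.url = ingest_request.base_url)
+    ) TO '/grande/snapshots/s2_hosted_captured.rows.json';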
+
+For other random web URLs, can do the usual persist/backfill/recrawl pipeline.
+
+## Create Seedlist
+
+ zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz
+ zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz
+
+ zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list
+ zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list
+
+ zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz
+ zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz
+
+ zcat s2_external_ingestrequest.json.gz | wc -l
+ 41201427
+ zcat s2_hosted_ingestrequest.json.gz | wc -l
+ 23345761
diff --git a/notes/ingest/2020-04-13_covid19.md b/notes/ingest/2020-04-13_covid19.md
new file mode 100644
index 0000000..b442d69
--- /dev/null
+++ b/notes/ingest/2020-04-13_covid19.md
@@ -0,0 +1,73 @@
+
+Want to ensure seedlists from Wanfang and CNKI are captured in wayback.
+
+Wanfang URLs seem normal. Let's just submit them in a single queue via SPNv2.
+They are heterogeneous after redirect.
+
+CNKI URLs are trickier. The PDF URLs definitely can't be crawled directly...
+but the info pages probably can, and then we can crawl on to the PDF? At least
+some seem to capture OK.
+
+Need scope and identifiers for ingest requests. Let's do:
+
+ cnki_covid19 / <ident>
+ wanfang_covid19 / <ident>
+
+Source: scrape-covid19
+
+## Commands
+
+ # in sandcrawler pipenv
+ cat ~/code/covid19.fatcat.wiki/extra/scrape/cnki_metadata.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/cnki_ingest_request.2020-04-14.json
+ cat ~/code/covid19.fatcat.wiki/extra/scrape/wanfang*.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/wanfang_ingest_request.2020-04-14.json
+
+
+ cat /tmp/wanfang_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 4
+ cat /tmp/cnki_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 8
+
+## Status
+
+ SELECT ingest_request.ingest_type,
+ ingest_file_result.status,
+ COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'scrape-covid19'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.status
+ ORDER BY COUNT(*) DESC;
+
+2020-04-15:
+
+ ingest_type | status | count
+ -------------+-------------------------------------+-------
+ pdf | spn2-cdx-lookup-failure | 1588
+ pdf | success | 671
+ pdf | gateway-timeout | 507
+ pdf | no-pdf-link | 181
+ pdf | wayback-error | 30
+ pdf | spn2-error:job-failed | 20
+ pdf | spn2-error | 7
+ pdf | spn2-error:soft-time-limit-exceeded | 3
+ pdf | spn2-error:pending | 2
+ (9 rows)
+
+## Re-Try
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'scrape-covid19'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status != 'no-pdf-link'
+ AND ingest_file_result.status != 'link-loop'
+ ) TO '/grande/snapshots/reingest_covid19.rows.json';
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_covid19.rows.json | shuf > reingest_covid19.json
+
+ cat reingest_covid19.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 9
+
diff --git a/notes/ingest/2020-04_datacite.md b/notes/ingest/2020-04_datacite.md
new file mode 100644
index 0000000..0fc7e67
--- /dev/null
+++ b/notes/ingest/2020-04_datacite.md
@@ -0,0 +1,121 @@
+
+After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many
+of the DOIs are for, eg, datasets, and we don't want to waste time on those.
+
+Instead of using full ingest request file from the crawl, will generate a new
+ingest request file using `fatcat_ingest.py` and set that up for bulk crawling.
+
+## Generate Requests
+
+ ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json
+ => Expecting 8905453 release objects in search queries
+ => 8.91M 11:49:50 [ 209 /s]
+ => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453})
+
+## Bulk Ingest
+
+ cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Ingest Stats
+
+Note that this will have a small fraction of non-datacite results mixed in (eg,
+from COVID-19 targeted crawls):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'doi'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND created >= '2020-04-07'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+---------
+ no-pdf-link | 4646767
+ redirect-loop | 1447229
+ no-capture | 860235
+ success | 849501
+ terminal-bad-status | 174869
+ cdx-error | 159805
+ wayback-error | 18076
+ wrong-mimetype | 11169
+ link-loop | 8410
+ gateway-timeout | 4034
+ spn2-cdx-lookup-failure | 510
+ petabox-error | 339
+ null-body | 251
+ spn2-error | 19
+ spn2-error:job-failed | 14
+ bad-gzip-encoding | 13
+ timeout | 5
+ spn2-error:soft-time-limit-exceeded | 4
+ invalid-host-resolution | 2
+ spn2-error:pending | 1
+ (20 rows)
+
+Top domains/statuses (including success):
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'doi'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND created >= '2020-04-07'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ---------------------------------------+---------------------+--------
+ ssl.fao.org | no-pdf-link | 862277
+ www.e-periodica.ch | no-pdf-link | 746781
+ www.researchgate.net | redirect-loop | 664524
+ dlc.library.columbia.edu | no-pdf-link | 493111
+ www.die-bonn.de | redirect-loop | 352903
+ figshare.com | no-pdf-link | 319709
+ statisticaldatasets.data-planet.com | no-pdf-link | 309584
+ catalog.paradisec.org.au | redirect-loop | 225396
+ zenodo.org | no-capture | 193201
+ digi.ub.uni-heidelberg.de | no-pdf-link | 184974
+ open.library.ubc.ca | no-pdf-link | 167841
+ zenodo.org | no-pdf-link | 130617
+ www.google.com | no-pdf-link | 111312
+ www.e-manuscripta.ch | no-pdf-link | 79192
+ ds.iris.edu | no-pdf-link | 77649
+ data.inra.fr | no-pdf-link | 69440
+ www.tib.eu | no-pdf-link | 63872
+ www.egms.de | redirect-loop | 53877
+ archaeologydataservice.ac.uk | redirect-loop | 52838
+ d.lib.msu.edu | no-pdf-link | 45297
+ www.e-rara.ch | no-pdf-link | 45163
+ springernature.figshare.com | no-pdf-link | 42527
+ boris.unibe.ch | no-pdf-link | 40816
+ www.research-collection.ethz.ch | no-capture | 40350
+ spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059
+ repository.dri.ie | terminal-bad-status | 32760
+ othes.univie.ac.at | no-pdf-link | 32558
+ repositories.lib.utexas.edu | no-capture | 31526
+ posterng.netkey.at | no-pdf-link | 30315
+ zenodo.org | terminal-bad-status | 29614
+ (30 rows)
+
diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md
new file mode 100644
index 0000000..a5e3bb1
--- /dev/null
+++ b/notes/ingest/2020-04_unpaywall.md
@@ -0,0 +1,312 @@
+
+A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but
+not released for more than a month).
+
+Primary goal is:
+
+- generate ingest requests for only *new* URLs
+- bulk ingest these new URLs
+- crawl any no-capture URLs from that batch
+- re-bulk-ingest the no-capture batch
+- analytics on failed ingests. eg, any particular domains that are failing to crawl
+
+This ingest pipeline was started on 2020-04-07 by bnewbold.
+
+Ran through the first two steps again on 2020-05-03 after unpaywall had
+released another dump (dated 2020-04-27).
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json
+ => 24.7M 5:17:03 [ 1.3k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => 24.7M
+ => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0})
+
+Second time:
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json
+ => 25.2M 3:16:28 [2.14k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390})
+
+
+## Dump new URLs and Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';
+ => 3696189
+
+ WARNING: forgot to transform from rows to ingest requests.
+
+ cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Second time:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json';
+ => 1799760
+
+ WARNING: forgot to transform from rows to ingest requests.
+
+ cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Dump no-capture, Run Crawl
+
+Make two ingest request dumps: one with "all" URLs, which we will have heritrix
+attempt to crawl, and then one with certain domains filtered out, which we may
+or may not bother trying to ingest (due to expectation of failure).
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json';
+ => 2734145
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';
+ => 2602408
+
+NOTE: forgot here to transform from "rows" to ingest requests.
+
+Not actually a very significant size difference after all.
+
+See `journal-crawls` repo for details on seedlist generation and crawling.
+
+## Re-Ingest Post-Crawl
+
+NOTE: if we *do* want to do cleanup eventually, could look for fatcat edits
+between 2020-04-01 and 2020-05-25 which have limited "extra" metadata (eg, no
+evidence or `oa_status`).
+
+The earlier bulk ingests were done wrong (forgot to transform from rows to full
+ingest request docs), so going to re-do those, which should be a superset of
+the no-capture crawl URLs:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-04-08.json
+ => 1.26M 0:00:58 [21.5k/s]
+ => previously: 3,696,189
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-05-03.json
+ => 1.26M 0:00:56 [22.3k/s]
+
+Crap, looks like the 2020-04-08 segment got overwritten with 2020-05 data by
+accident. Hrm... need to re-ingest *all* recent unpaywall URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ ) TO '/grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json';
+ => COPY 5691106
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json
+ => 5.69M 0:04:26 [21.3k/s]
+
+Start small:
+
+ cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Looks good (whew), run the full thing:
+
+ cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Post-ingest stats (2020-08-28)
+
+Overall status:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 22063013
+ no-pdf-link | 2192606
+ redirect-loop | 1471135
+ terminal-bad-status | 995106
+ no-capture | 359440
+ cdx-error | 358909
+ wrong-mimetype | 111685
+ wayback-error | 50705
+ link-loop | 29359
+ null-body | 13667
+ gateway-timeout | 3689
+ spn2-cdx-lookup-failure | 1229
+ petabox-error | 1007
+ redirects-exceeded | 747
+ invalid-host-resolution | 464
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ bad-redirect | 26
+ spn2-error:soft-time-limit-exceeded | 9
+ bad-gzip-encoding | 5
+ (20 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ -----------------------------------+---------------------+--------
+ academic.oup.com | no-pdf-link | 415441
+ watermark.silverchair.com | terminal-bad-status | 345937
+ www.tandfonline.com | no-pdf-link | 262488
+ journals.sagepub.com | no-pdf-link | 235707
+ onlinelibrary.wiley.com | no-pdf-link | 225876
+ iopscience.iop.org | terminal-bad-status | 170783
+ www.nature.com | redirect-loop | 145522
+ www.degruyter.com | redirect-loop | 131898
+ files-journal-api.frontiersin.org | terminal-bad-status | 126091
+ pubs.acs.org | no-pdf-link | 119223
+ society.kisti.re.kr | no-pdf-link | 112401
+ www.ahajournals.org | no-pdf-link | 105953
+ dialnet.unirioja.es | terminal-bad-status | 96505
+ www.cell.com | redirect-loop | 87560
+ www.ncbi.nlm.nih.gov | redirect-loop | 49890
+ ageconsearch.umn.edu | redirect-loop | 45989
+ ashpublications.org | no-pdf-link | 45833
+ pure.mpg.de | redirect-loop | 45278
+ www.degruyter.com | terminal-bad-status | 43642
+ babel.hathitrust.org | terminal-bad-status | 42057
+ osf.io | redirect-loop | 41119
+ scialert.net | no-pdf-link | 39009
+ dialnet.unirioja.es | redirect-loop | 38839
+ www.jci.org | redirect-loop | 34209
+ www.spandidos-publications.com | redirect-loop | 33167
+ www.journal.csj.jp | no-pdf-link | 30915
+ journals.openedition.org | redirect-loop | 30409
+ www.valueinhealthjournal.com | redirect-loop | 30090
+ dergipark.org.tr | no-pdf-link | 29146
+ journals.ametsoc.org | no-pdf-link | 29133
+ (30 rows)
+
+Enqueue internal failures for re-ingest:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/unpaywall_errors_2020-08-28.rows.json';
+ => 409606
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_errors_2020-08-28.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_errors_2020-08-28.requests.json
+
+ cat /grande/snapshots/unpaywall_errors_2020-08-28.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+And after *that* (which ran quickly):
+
+ status | count
+ -------------------------------------+----------
+ success | 22281874
+ no-pdf-link | 2258352
+ redirect-loop | 1499251
+ terminal-bad-status | 1004781
+ no-capture | 401333
+ wrong-mimetype | 112068
+ cdx-error | 32259
+ link-loop | 30137
+ null-body | 13886
+ wayback-error | 11653
+ gateway-timeout | 3689
+ spn2-cdx-lookup-failure | 1229
+ petabox-error | 1036
+ redirects-exceeded | 749
+ invalid-host-resolution | 464
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ bad-redirect | 26
+ spn2-error:soft-time-limit-exceeded | 9
+ bad-gzip-encoding | 5
+ (20 rows)
+
+22063013 -> 22281874 = + 218,861 success, not bad!
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
new file mode 100644
index 0000000..fe22c75
--- /dev/null
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -0,0 +1,428 @@
+
+Primary Goal: start large crawl of OAI landing pages that we haven't seen
+
+Fields of interest for ingest:
+- oai identifier
+- doi
+- formats
+- urls (maybe also "relations")
+- types (type+stage)
+
+## Other Tasks
+
+About 150 million total lines.
+
+Types coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt
+
+Dump all ISSNs, with counts, quick check how many are in chocula/fatcat
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt
+
+Language coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt
+
+Format coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt
+ => 150M 0:56:14 [44.7k/s]
+
+Have a DOI?
+
+ zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l
+ => 16,013,503
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt
+ => 11,940,950
+
+## Transform, Load, Bulk Ingest
+
+ zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz
+ => 80M 6:36:55 [3.36k/s]
+
+ time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 80M 4:00:21 [5.55k/s]
+ => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963})
+
+ => real 240m21.207s
+ => user 85m12.576s
+ => sys 3m29.580s
+
+ select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai';
+ => 51,185,088
+
+Why so many (30 million) skipped? Not unique?
+
+ zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l
+ => 51,185,088
+
+ zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt
+ wc -l request_url.txt
+ => 50,002,674 request_url.txt
+
+ zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt
+ wc -l requires_oai.txt
+ => 34,622,083 requires_oai.txt
+
+Yup, tons of duplication. And remember this is exact URL, not SURT or similar.
+
+How many of these are URLs we have seen and ingested already?
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ | 49491452
+ success | 1469113
+ no-capture | 134611
+ redirect-loop | 59666
+ no-pdf-link | 8947
+ cdx-error | 7561
+ terminal-bad-status | 6704
+ null-body | 5042
+ wrong-mimetype | 879
+ wayback-error | 722
+ petabox-error | 198
+ gateway-timeout | 86
+ link-loop | 51
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ spn2-error | 4
+ bad-gzip-encoding | 4
+ spn2-error:job-failed | 2
+ (18 rows)
+
+Dump ingest requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/oai_noingest_20200506.rows.json';
+ => COPY 49491452
+
+ WARNING: should have transformed from rows to requests here
+
+ cat /grande/snapshots/oai_noingest_20200506.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Crawl and re-ingest
+
+Updated stats after ingest (NOTE: ingest requests not really formed correctly,
+but doesn't matter because fatcat wasn't importing these anyways):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ no-capture | 42565875
+ success | 5227609
+ no-pdf-link | 2156341
+ redirect-loop | 559721
+ cdx-error | 260446
+ wrong-mimetype | 148871
+ terminal-bad-status | 109725
+ link-loop | 92792
+ null-body | 30688
+ | 15287
+ petabox-error | 11109
+ wayback-error | 6261
+ skip-url-blocklist | 184
+ gateway-timeout | 86
+ bad-gzip-encoding | 25
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ bad-redirect | 15
+ spn2-error | 4
+ spn2-error:job-failed | 2
+ (20 rows)
+
+Dump again for crawling:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error')
+ ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json';
+
+Notes about crawl setup are in `journal-crawls` repo. Excluded the following domains:
+
+ 4876135 www.kb.dk REMOVE: too large and generic
+ 3110009 kb-images.kb.dk REMOVE: dead?
+ 1274638 mdz-nbn-resolving.de REMOVE: maybe broken
+ 982312 aggr.ukm.um.si REMOVE: maybe broken
+
+We went from about 42,826,313 rows to 31,773,874 unique URLs to crawl, so we
+expect at least 11,052,439 `no-capture` ingest results (and should probably
+filter these out, or even delete them from the ingest request table).
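+
+One possible form of that cleanup, as a sketch only (not run; would drop all
+OAI requests for the excluded hosts above):
+
+    DELETE FROM ingest_request
+    WHERE
+        ingest_type = 'pdf'
+        AND link_source = 'oai'
+        AND (base_url LIKE '%//www.kb.dk/%'
+            OR base_url LIKE '%//kb-images.kb.dk/%'
+            OR base_url LIKE '%//mdz-nbn-resolving.de/%'
+            OR base_url LIKE '%//aggr.ukm.um.si/%');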
+
+Ingest progress:
+
+ 2020-08-05 14:02: 32,571,018
+ 2020-08-06 13:49: 31,195,169
+ 2020-08-07 10:11: 29,986,169
+ 2020-08-10 10:43: 26,497,196
+ 2020-08-12 11:02: 23,811,845
+ 2020-08-17 13:34: 19,460,502
+ 2020-08-20 09:49: 15,069,507
+ 2020-08-25 09:56: 9,397,035
+ 2020-09-02 15:02: 305,889 (72k longest queue)
+ 2020-09-03 14:30: done
+
+## Post-ingest stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ no-capture | 16804277
+ no-pdf-link | 14895249
+ success | 13898603
+ redirect-loop | 2709730
+ cdx-error | 827024
+ terminal-bad-status | 740037
+ wrong-mimetype | 604242
+ link-loop | 532553
+ null-body | 95721
+ wayback-error | 41864
+ petabox-error | 19204
+ | 15287
+ gateway-timeout | 510
+ bad-redirect | 318
+ skip-url-blocklist | 184
+ bad-gzip-encoding | 114
+ timeout | 78
+ spn2-cdx-lookup-failure | 59
+ invalid-host-resolution | 19
+ blocked-cookie | 6
+ (20 rows)
+
+Hrm, +8 million or so 'success', but that is still a lot of no-capture. May be
+worth dumping the full Kafka result topic, filtering to OAI requests, and
+extracting the missing URLs.
+
+Top counts by OAI prefix:
+
+ SELECT
+ oai_prefix,
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
+ COUNT(*) as total
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ GROUP BY oai_prefix
+ ORDER BY total DESC
+ LIMIT 25;
+
+ oai_prefix | success | total
+ --------------------------+---------+---------
+ kb.dk | 0 | 7989412 (excluded)
+ repec | 1118591 | 2783448
+ bnf.fr | 0 | 2187277
+ hispana.mcu.es | 19404 | 1492639
+ bdr.oai.bsb-muenchen.de | 73 | 1319882 (excluded?)
+ hal | 564700 | 1049607
+ ukm.si | 0 | 982468 (excluded)
+ hsp.org | 0 | 810281
+ www.irgrid.ac.cn | 17578 | 748828
+ cds.cern.ch | 72811 | 688091
+ americanae.aecid.es | 69678 | 572792
+ biodiversitylibrary.org | 2121 | 566154
+ juser.fz-juelich.de | 22777 | 518551
+ espace.library.uq.edu.au | 6494 | 508960
+ igi.indrastra.com | 58689 | 478577
+ archive.ugent.be | 63654 | 424014
+ hrcak.srce.hr | 395031 | 414897
+ zir.nsk.hr | 153889 | 397200
+ renati.sunedu.gob.pe | 78399 | 388355
+ hypotheses.org | 3 | 374296
+ rour.neicon.ru | 7963 | 354529
+ generic.eprints.org | 261221 | 340470
+ invenio.nusl.cz | 6184 | 325867
+ evastar-karlsruhe.de | 62044 | 317952
+ quod.lib.umich.edu | 5 | 309135
+ (25 rows)
+
+Top counts by OAI prefix and status:
+
+ SELECT
+ oai_prefix,
+ status,
+ COUNT((oai_prefix,status))
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ GROUP BY oai_prefix, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ oai_prefix | status | count
+ --------------------------+---------------+---------
+ kb.dk | no-capture | 7955231 (excluded)
+ bdr.oai.bsb-muenchen.de | no-capture | 1270209 (excluded?)
+ repec | success | 1118591
+ hispana.mcu.es | no-pdf-link | 1118092
+ bnf.fr | no-capture | 1100591
+ ukm.si | no-capture | 976004 (excluded)
+ hsp.org | no-pdf-link | 773496
+ repec | no-pdf-link | 625629
+ bnf.fr | no-pdf-link | 607813
+ hal | success | 564700
+ biodiversitylibrary.org | no-pdf-link | 531409
+ cds.cern.ch | no-capture | 529842
+ repec | redirect-loop | 504393
+ juser.fz-juelich.de | no-pdf-link | 468813
+ bnf.fr | redirect-loop | 436087
+ americanae.aecid.es | no-pdf-link | 409954
+ hrcak.srce.hr | success | 395031
+ www.irgrid.ac.cn | no-pdf-link | 362087
+ hal | no-pdf-link | 352111
+ www.irgrid.ac.cn | no-capture | 346963
+ espace.library.uq.edu.au | no-pdf-link | 315302
+ igi.indrastra.com | no-pdf-link | 312087
+ repec | no-capture | 309882
+ invenio.nusl.cz | no-pdf-link | 302657
+ hypotheses.org | no-pdf-link | 298750
+ rour.neicon.ru | redirect-loop | 291922
+ renati.sunedu.gob.pe | no-capture | 276388
+ t2r2.star.titech.ac.jp | no-pdf-link | 264109
+ generic.eprints.org | success | 261221
+ quod.lib.umich.edu | no-pdf-link | 253937
+ (30 rows)
+
+If we remove excluded prefixes, and some large/generic prefixes (bnf.fr,
+hispana.mcu.es, hsp.org), then the aggregate counts are:
+
+ no-capture | 16,804,277 -> 5,502,242
+ no-pdf-link | 14,895,249 -> 12,395,848
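+
+These adjusted counts can be computed by extending the prefix query above with
+a filter; a sketch (the exact exclusion list here is an assumption):
+
+    SELECT status, COUNT(*)
+    FROM (
+        SELECT
+            ingest_file_result.status AS status,
+            substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'oai'
+    ) t1
+    WHERE t1.oai_prefix NOT IN
+        ('kb.dk', 'bdr.oai.bsb-muenchen.de', 'ukm.si', 'bnf.fr', 'hispana.mcu.es', 'hsp.org')
+    GROUP BY status
+    ORDER BY COUNT(*) DESC
+    LIMIT 20;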
+
+Top status by terminal domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ----------------------------------+---------------+--------
+ hispana.mcu.es | no-pdf-link | 709701 (national scope)
+ gallica.bnf.fr | no-pdf-link | 601193 (national scope)
+ discover.hsp.org | no-pdf-link | 524212 (historical)
+ www.biodiversitylibrary.org | no-pdf-link | 479288
+ gallica.bnf.fr | redirect-loop | 435981 (national scope)
+ hrcak.srce.hr | success | 389673
+ hemerotecadigital.bne.es | no-pdf-link | 359243
+ juser.fz-juelich.de | no-pdf-link | 345112
+ espace.library.uq.edu.au | no-pdf-link | 304299
+ invenio.nusl.cz | no-pdf-link | 302586
+ igi.indrastra.com | no-pdf-link | 292006
+ openrepository.ru | redirect-loop | 291555
+ hal.archives-ouvertes.fr | success | 278134
+ t2r2.star.titech.ac.jp | no-pdf-link | 263971
+ bib-pubdb1.desy.de | no-pdf-link | 254879
+ quod.lib.umich.edu | no-pdf-link | 250382
+ encounters.hsp.org | no-pdf-link | 248132
+ americanae.aecid.es | no-pdf-link | 245295
+ www.irgrid.ac.cn | no-pdf-link | 242496
+ publikationen.bibliothek.kit.edu | no-pdf-link | 222041
+ www.sciencedirect.com | no-pdf-link | 211756
+ dialnet.unirioja.es | redirect-loop | 203615
+ edoc.mpg.de | no-pdf-link | 195526
+ bibliotecadigital.jcyl.es | no-pdf-link | 184671
+ hal.archives-ouvertes.fr | no-pdf-link | 183809
+ www.sciencedirect.com | redirect-loop | 173439
+ lup.lub.lu.se | no-pdf-link | 165788
+ orbi.uliege.be | no-pdf-link | 158313
+ www.erudit.org | success | 155986
+ lib.dr.iastate.edu | success | 153384
+ (30 rows)
+
+Follow-ups are TBD but could include:
+- crawling the ~5m no-capture links directly (eg, not `base_url`) from the
+ ingest result JSON, while retaining the ingest request for later re-ingest
+- investigating and iterating on PDF link extraction, both for large platforms
+ and randomly sampled from long tail
+- classifying OAI prefixes by type (subject repository, institutional
+ repository, journal, national-library, historical docs, greylit, law, etc)
+- running pdftrio over some/all of this corpus
diff --git a/notes/ingest/2020-05_pubmed.md b/notes/ingest/2020-05_pubmed.md
new file mode 100644
index 0000000..36d00a1
--- /dev/null
+++ b/notes/ingest/2020-05_pubmed.md
@@ -0,0 +1,10 @@
+
+From ARXIV-PUBMEDCENTRAL-CRAWL-2020-04, on fatcat-prod1.
+
+Test small batch:
+
+ zcat ingest_file_pmcid_20200424.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Run the whole batch:
+
+ zcat ingest_file_pmcid_20200424.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2020-07_mag.md b/notes/ingest/2020-07_mag.md
new file mode 100644
index 0000000..1d33162
--- /dev/null
+++ b/notes/ingest/2020-07_mag.md
@@ -0,0 +1,353 @@
+
+Using 2020-06-25 upstream MAG corpus snapshot.
+
+Ran munging from `scratch:ingest/mag` notes first.
+
+Expecting a couple million new ingest request URLs; this is the first "patch"
+MAG ingest on top of existing already-run requests.
+
+Planning to skip the initial bulk ingest step, on the assumption that new URLs
+have either been ingested already (eg, via continuous ingest pipeline) or need
+crawling.
+
+## Generate Requests
+
+ export LC_ALL=C
+ cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 | pv -l > ingest_requests_mag-2020-06-25.json
+ => 28.7M 2:36:48 [3.06k/s]
+
+ export LC_ALL=C
+ zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 --pmid | pv -l > ingest_requests_mag-2020-06-25.pmid.json
+ => 5.66M 0:29:28 [ 3.2k/s]
+
+## Persist Ingest Requests
+
+ # small sample
+ head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 319, 'update-requests': 0})
+
+ head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request -
+    => Worker: Counter({'total': 1000, 'insert-requests': 304, 'update-requests': 0})
+
+ cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 5662486, 'insert-requests': 1984605, 'update-requests': 0})
+
+ cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 28743819, 'insert-requests': 7433465, 'update-requests': 0})
+
+## Crawl/Dupe Status
+
+Overall status for old and new seeds, filtering out large (blocking)
+publishers:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 19477651
+ | 8238898
+ redirect-loop | 2036494
+ link-loop | 1330036
+ no-pdf-link | 1304820
+ terminal-bad-status | 648150
+ no-capture | 545785
+ gateway-timeout | 200143
+ cdx-error | 149995
+ spn2-cdx-lookup-failure | 80010
+ wrong-mimetype | 57052
+ wayback-error | 41032
+ invalid-host-resolution | 37203
+ petabox-error | 11167
+ null-body | 6662
+ spn2-error | 1698
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 77
+ (20 rows)
+
+Just the new seeds:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.created > '2020-06-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+---------
+ | 8238851
+ success | 787174
+ no-capture | 42864
+ redirect-loop | 31718
+ terminal-bad-status | 31493
+ no-pdf-link | 13025
+ cdx-error | 11275
+ wrong-mimetype | 6238
+ link-loop | 3365
+ wayback-error | 748
+ gateway-timeout | 506
+ null-body | 191
+ spn2-cdx-lookup-failure | 99
+ petabox-error | 89
+ invalid-host-resolution | 70
+ spn2-error | 7
+ spn2-error:job-failed | 2
+ spn2-error:soft-time-limit-exceeded | 1
+ bad-gzip-encoding | 1
+ (19 rows)
+
+Where are no-capture results terminating? May need to add or update heritrix
+crawl config so that we get better yield without needing to do SPNv2 crawling.
+
+ SELECT initial_domain, terminal_domain, COUNT(*)
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS initial_domain,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS terminal_domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ ) t1
+ GROUP BY initial_domain, terminal_domain
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ initial_domain | terminal_domain | count
+ ---------------------------------+---------------------+--------
+ www.researchgate.net | | 334145
+ academic.oup.com | | 205820
+ www.tandfonline.com | | 148638
+ journals.sagepub.com | | 144196
+ muse.jhu.edu | | 55957
+ hrcak.srce.hr | | 25317
+ www.omicsonline.org | | 22426
+ link.springer.com | | 21044
+ iopscience.iop.org | | 12385
+ bioone.org | | 9097
+ tandfonline.com | | 8512
+ or.nsfc.gov.cn | | 4823
+ ieeexplore.ieee.org | ieeexplore.ieee.org | 4398
+ pubs.acs.org | | 3708
+ archive-ouverte.unige.ch | | 2743
+ dergipark.ulakbim.gov.tr | | 2677
+ hal.archives-ouvertes.fr | | 1258
+ dergipark.org.tr | | 1207
+ apo.org.au | | 1186
+ spire.sciencespo.fr | | 989
+ cyberleninka.ru | | 895
+ lirias.kuleuven.be | | 855
+ tel.archives-ouvertes.fr | | 786
+ pub.uni-bielefeld.de | | 728
+ www.research-collection.ethz.ch | | 670
+ (25 rows)
+
+## Heritrix Seedlist Generation
+
+Dump ingest requests (filtered for some domains that don't expect to crawl via
+heritrix):
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status IS NULL)
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200708.rows.json';
+ => 8784683
+
+ # in sandcrawler pipenv
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200708.rows.json > /grande/snapshots/mag_nocapture_20200708.json
+
+Seedlist transform from here on covered in MAG crawl notes.
+
+## Bulk Ingest
+
+Run ingest requests on everything we crawled; small sample first, then the
+full batch:
+
+Small sample:
+
+ head -n1000 /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+ cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Updated Overall Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 24574294
+ redirect-loop | 2633731
+ no-capture | 2458694
+ no-pdf-link | 1896871
+ link-loop | 1510899
+ terminal-bad-status | 878821
+ cdx-error | 387574
+ gateway-timeout | 200246
+ | 170304
+ wayback-error | 97572
+ spn2-cdx-lookup-failure | 80284
+ wrong-mimetype | 65097
+ invalid-host-resolution | 37204
+ petabox-error | 12097
+ null-body | 8549
+ spn2-error | 1706
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 90
+ (20 rows)
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 24557382
+ redirect-loop | 2630582
+ no-capture | 1947066
+ no-pdf-link | 1778206
+ link-loop | 1510790
+ terminal-bad-status | 857173
+ cdx-error | 384525
+ gateway-timeout | 200143
+ wayback-error | 96390
+ spn2-cdx-lookup-failure | 80010
+ wrong-mimetype | 64908
+ invalid-host-resolution | 37203
+ petabox-error | 12087
+ null-body | 8548
+ spn2-error | 1698
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 90
+ | 69
+ (20 rows)
+
+Just the new seeds:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.created > '2020-06-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------------+---------
+ success | 5860601
+ no-capture | 1489959
+ redirect-loop | 619121
+ no-pdf-link | 473703
+ terminal-bad-status | 234753
+ cdx-error | 231575
+ link-loop | 184093
+ wayback-error | 56068
+ wrong-mimetype | 14046
+ null-body | 2068
+ petabox-error | 1006
+ gateway-timeout | 506
+ spn2-cdx-lookup-failure | 99
+ invalid-host-resolution | 70
+ | 22
+ bad-redirect | 13
+ spn2-error | 7
+ timeout | 3
+ spn2-error:job-failed | 2
+ spn2-error:soft-time-limit-exceeded | 1
+ (20 rows)
+
diff --git a/notes/ingest/2020-08_daily_improvements.md b/notes/ingest/2020-08_daily_improvements.md
new file mode 100644
index 0000000..da57065
--- /dev/null
+++ b/notes/ingest/2020-08_daily_improvements.md
@@ -0,0 +1,202 @@
+
+Goal is to increase rate of successful daily changelog crawling, but reduce
+wasted attempts.
+
+Status by domain, past 30 days:
+
+ domain | status | count
+ --------------------------------------+-----------------+-------
+ arxiv.org | success | 21792
+ zenodo.org | success | 10646
+ res.mdpi.com | success | 10449
+ springernature.figshare.com | no-pdf-link | 10430
+ s3-eu-west-1.amazonaws.com | success | 8966
+ zenodo.org | no-pdf-link | 8137
+ hkvalidate.perfdrive.com | no-pdf-link | 5943
+ www.ams.org:80 | no-pdf-link | 5799
+ assets.researchsquare.com | success | 4651
+ pdf.sciencedirectassets.com | success | 4145
+ fjfsdata01prod.blob.core.windows.net | success | 3500
+ sage.figshare.com | no-pdf-link | 3174
+ onlinelibrary.wiley.com | no-pdf-link | 2869
+ www.e-periodica.ch | no-pdf-link | 2709
+ revistas.uned.es | success | 2631
+ figshare.com | no-pdf-link | 2500
+ www.sciencedirect.com | link-loop | 2477
+ linkinghub.elsevier.com | gateway-timeout | 1878
+ downloads.hindawi.com | success | 1819
+ www.scielo.br | success | 1691
+ jps.library.utoronto.ca | success | 1590
+ www.ams.org | no-pdf-link | 1568
+ digi.ub.uni-heidelberg.de | no-pdf-link | 1496
+ research-repository.griffith.edu.au | success | 1412
+ journals.plos.org | success | 1330
+ (25 rows)
+
+Status by DOI prefix, past 30 days:
+
+ doi_prefix | status | count
+ ------------+-------------------------+-------
+ 10.6084 | no-pdf-link | 14410 <- figshare; small fraction success
+ 10.6084 | success | 4007
+ 10.6084 | cdx-error | 1746
+
+ 10.13140 | gateway-timeout | 9689 <- researchgate
+ 10.13140 | cdx-error | 4154
+
+ 10.5281 | success | 9408 <- zenodo
+ 10.5281 | no-pdf-link | 6079
+ 10.5281 | cdx-error | 3200
+ 10.5281 | wayback-error | 2098
+
+ 10.1090 | no-pdf-link | 7420 <- AMS (ams.org)
+
+ 10.3390 | success | 6599 <- MDPI
+ 10.3390 | cdx-error | 3032
+ 10.3390 | wayback-error | 1636
+
+ 10.1088 | no-pdf-link | 3227 <- IOP science
+
+ 10.1101 | gateway-timeout | 3168 <- coldspring harbor: press, biorxiv, medrxiv, etc
+ 10.1101 | cdx-error | 1147
+
+ 10.21203 | success | 3124 <- researchsquare
+ 10.21203 | cdx-error | 1181
+
+ 10.1016 | success | 3083 <- elsevier
+ 10.1016 | cdx-error | 2465
+ 10.1016 | gateway-timeout | 1682
+ 10.1016 | wayback-error | 1567
+
+ 10.25384 | no-pdf-link | 3058 <- sage figshare
+ 10.25384 | success | 2456
+
+ 10.1007 | gateway-timeout | 2913 <- springer
+ 10.1007 | cdx-error | 1164
+
+ 10.5944 | success | 2831
+ 10.1186 | success | 2650
+ 10.5169 | no-pdf-link | 2644 <- www.e-periodica.ch
+ 10.3389 | success | 2279
+ 10.24411 | gateway-timeout | 2184 <- cyberleninka.ru
+ 10.1038 | gateway-timeout | 2143 <- nature group
+ 10.1177 | gateway-timeout | 2038 <- SAGE
+ 10.11588 | no-pdf-link | 1574 <- journals.ub.uni-heidelberg.de (OJS?)
+ 10.25904 | success | 1416
+ 10.1155 | success | 1304
+ 10.21994 | no-pdf-link | 1268 <- loar.kb.dk
+ 10.18720 | spn2-cdx-lookup-failure | 1232 <- elib.spbstu.ru
+ 10.24411 | cdx-error | 1202
+ 10.1055 | no-pdf-link | 1170 <- thieme-connect.de
+ (40 rows)
+
+code changes for ingest:
+x hkvalidate.perfdrive.com: just bail when we see this
+x skip large publishers which gateway-timeout (for now)
+ - springerlink (10.1007)
+ - nature group (10.1038)
+ - SAGE (10.1177)
+ - IOP (10.1088)
+
+fatcat:
+x figshare (by `doi_prefix`): if not versioned (suffix), skip crawl
+x zenodo: also try to not crawl if unversioned (group)
+x figshare import metadata
+
+sandcrawler:
+x ends with `cookieAbsent` or `cookieSet=1` -> status as cookie-blocked
+x https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist
+x verify that we do quick-get for arxiv.org + europepmc.org (+ figshare/zenodo?)
+ => we were not!
+x shorten post-SPNv2 CDX pause? for throughput, given that we are re-trying anyways
+x ensure that we store uncrawled URL somewhere on no-capture status
+ => in HTML or last of hops
+ => not in DB, but that is a bigger change
+
+- try to get un-blocked:
+ - coldspring harbor has been blocking since 2020-06-22? yikes!
+ - cyberleninka.ru
+ - arxiv.org
+
+- no-pdf-link
+ x www.ams.org (10.1090)
+ => these seem to be stale captures, eg from 2008. newer captures have citation_pdf_url
+ => should consider recrawling all of ams.org?
+ => not sure why these crawl requests are happening only now
+ => on the order of 15k OA articles not in ia; 43k total not preserved
+ => force recrawl OA subset (DONE)
+ x www.e-periodica.ch (10.5169)
+ => TODO: dump un-preserved URLs, transform to PDF urls, heritrix crawl, re-ingest
+ x digi.ub.uni-heidelberg.de (10.11588)
+ => TODO: bulk re-enqueue? then heritrix crawl?
+ - https://loar.kb.dk/handle/1902/6988 (10.21994)
+ => TODO: bulk re-enqueue
+ => site was updated recently (august 2020); now it crawls fine. need to re-ingest all?
+ => 7433 hits
+ - thieme-connect.de (10.1055)
+ => 600k+ missing
+ => TODO: bulk re-enqueue? then heritrix crawl?
+ => https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist
+ => generally just need to re-crawl all?
+
+Unresolved:
+- why so many spn2-errors on https://elib.spbstu.ru/ (10.18720)?
+
+## figshare
+
+10.6084 regular figshare
+10.25384 SAGE figshare
+
+For sage, "collections" are bogus? can we detect these in datacite metadata?
+
+If the figshare type fields look like:
+
+ ris: "GEN",
+ bibtex: "misc",
+ citeproc: "article",
+ schemaOrg: "Collection",
+ resourceType: "Collection",
+ resourceTypeGeneral: "Collection"
+
+then mark as 'stub'.
+
+"Additional file" items don't seem like "stub"; -> "component".
+
+title:"Figure {} from " -> component
+
+current types are mostly: article, stub, dataset, graphic, article-journal
+
+If the DOI suffix starts with "sage." (the 10.25384 prefix), then the publisher
+is SAGE (not figshare). Container name should be... sage.figshare.com?
+
+Set the release version to the version number from the DOI suffix (".v2", etc).
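+
+A rough Python sketch of the figshare-specific mapping described above. The
+real logic would live in the fatcat datacite importer; the function and field
+names here are illustrative, not the actual code:
+
+    def figshare_release_type(types: dict, title: str) -> str:
+        """Map datacite type info plus title for a figshare DOI to a release_type.
+
+        `types` is the datacite "types" block (resourceType, resourceTypeGeneral,
+        schemaOrg, etc). Treat "Collection" as stub; figure and "Additional file"
+        titles as component.
+        """
+        type_values = {
+            types.get("resourceType"),
+            types.get("resourceTypeGeneral"),
+            types.get("schemaOrg"),
+        }
+        if "Collection" in type_values:
+            return "stub"
+        if title.startswith("Figure ") and " from " in title:
+            return "component"
+        if title.startswith("Additional file"):
+            return "component"
+        return "article"
+
+    def figshare_version(doi: str):
+        """Versioned figshare DOIs end in ".v<N>"; return N, or None for group DOIs."""
+        parts = doi.rsplit(".v", 1)
+        if len(parts) == 2 and parts[1].isdigit():
+            return int(parts[1])
+        return None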
+
+## zenodo
+
+doi_prefix: 10.5281
+
+If a zenodo record has an "Identical to" relation, then it is a pre-print. In
+that case, drop container_id and set container_name to zenodo.org. *But* there
+are some journals now publishing exclusively on zenodo.org, so for those retain
+the container metadata. Examples:
+
+ "Detection of keyboard vibrations and effects on perceived piano quality"
+ https://fatcat.wiki/release/mufzkdgt2nbzfha44o7p7gkrpy
+
+ "Editing LAF: Educate, don't defend!"
+ https://zenodo.org/record/2583025
+
+version number not available in zenodo metadata
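+
+A sketch of the container handling described above; the relation-type string
+("IsIdenticalTo") and field names are assumptions about the datacite
+relatedIdentifiers metadata, not verified against the importer code:
+
+    def zenodo_container_fields(related_identifiers: list) -> dict:
+        """If a zenodo record is 'identical to' another record, treat it as a
+        pre-print: drop container_id and use zenodo.org as container_name.
+        Journals publishing exclusively on zenodo.org are the exception and
+        would need a container lookup (not shown)."""
+        is_preprint = any(
+            rel.get("relationType") == "IsIdenticalTo"
+            for rel in (related_identifiers or [])
+        )
+        if is_preprint:
+            return {"container_id": None, "container_name": "zenodo.org"}
+        return {}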
+
+## Gitlab MR Notes
+
+The main goal of this group of changes is to do a better job at daily ingest.
+
+Currently we have on the order of 20k new releases added to the index every day. About half of them are marked as OA (either via a CC license or via the container being in DOAJ or ROAD), pass some filters (eg, release_type), and are selected for ingest. Of those, about half fail to crawl to fulltext, mostly due to blocking (gateway-timeout, cookie tests, anti-bot detection, loginwall, etc). On the other hand, we don't attempt to crawl a lot of "bronze" OA: content that is available from the publisher website but isn't marked explicitly as OA.
+
+Based on investigating daily crawling from the past month (will commit these notes to sandcrawler soon), I have identified some DOI prefixes that almost always fail ingest via SPNv2. I also have some patches to sandcrawler ingest to improve ability to crawl some large repositories etc.
+
+Some of the biggest "OA but failed to crawl" sources are figshare and zenodo, which register a relatively large fraction of daily OA DOIs. We want to crawl most of that content, but both of these platforms register at least two DOIs for each piece of content (a "group" DOI and a "versioned" DOI), and we only need to crawl one. There were also some changes needed to release-type filtering and assignment, specific to these platforms or based on the title of entities.
+
+This MR mixes changes to the datacite metadata import routine (including some refactors out of the main parse_record method) and behavior changes to the entity updater (which is where the code that decides whether to send an ingest request on release creation lives). I will have a separate MR for importer metadata changes that don't impact ingest behavior.
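+
+For context, a simplified sketch of the kind of "should this new release get a
+live ingest request?" check the entity updater performs. The names and prefix
+lists here are illustrative, not the actual fatcat code:
+
+    INGEST_RELEASE_TYPES = {"article-journal", "article", "paper-conference", "report"}
+    # large publishers which currently always gateway-timeout via SPNv2 (skip for now)
+    SKIP_DOI_PREFIXES = {"10.1007", "10.1038", "10.1177", "10.1088"}
+    FIGSHARE_PREFIXES = {"10.6084", "10.25384"}
+
+    def should_ingest(release_type: str, is_oa: bool, doi: str) -> bool:
+        """Filter applied on release creation before queueing a live ingest request."""
+        if release_type not in INGEST_RELEASE_TYPES:
+            return False
+        if not is_oa:
+            return False
+        prefix, _, suffix = doi.partition("/")
+        if prefix in SKIP_DOI_PREFIXES:
+            return False
+        if prefix in FIGSHARE_PREFIXES:
+            # figshare: only crawl versioned DOIs (suffix ends in ".v<N>"), not group DOIs
+            tail = suffix.rsplit(".", 1)[-1]
+            if not (tail.startswith("v") and tail[1:].isdigit()):
+                return False
+        # zenodo (10.5281): same group-vs-versioned logic, but that requires
+        # relation metadata rather than just the DOI string (not shown here)
+        return True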
+
diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md
new file mode 100644
index 0000000..f5c853d
--- /dev/null
+++ b/notes/ingest/2020-09_oa_doi.md
@@ -0,0 +1,352 @@
+
+It seems that many gold OA DOIs were not ingesting simply because the HTML
+URL extraction was not working for a particular version of OJS.
+
+Let's re-try all ~2.5 million of these in bulk mode and see how many are
+'no-capture' vs. other errors, then possibly re-crawl a large number.
+
+## Bulk Ingest
+
+Dump ingest requests
+
+ ./fatcat_ingest.py query 'is_oa:true preservation:none !arxiv_id:* !pmcid:* !publisher_type:big5 type:article-journal' | pv -l > /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json
+ Expecting 2569876 release objects in search queries
+ Counter({'elasticsearch_release': 2569880, 'estimate': 2569880, 'ingest_request': 2063034})
+
+Enqueue
+
+ cat /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Started at about:
+
+ Thu Sep 17 00:15:00 UTC 2020
+ 2020-09-17T00:15:00Z
+
+## Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND ingest_file_result.updated >= '2020-09-16'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ status | count
+ -------------------------------------+--------
+ no-capture | 513462
+ success | 206042
+ no-pdf-link | 186779
+ terminal-bad-status | 40372
+ redirect-loop | 33103
+ cdx-error | 24078
+ link-loop | 13494
+ spn2-cdx-lookup-failure | 10247
+ gateway-timeout | 4407
+ wrong-mimetype | 3213
+ petabox-error | 866
+ null-body | 449
+ spn2-error | 217
+ wayback-error | 129
+ spn2-error:job-failed | 64
+ bad-redirect | 6
+ spn2-error:soft-time-limit-exceeded | 1
+ (17 rows)
+
+This was only about half of the requests. Try a broader query (also including
+fatcat-changelog requests):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-09-15'
+ AND ingest_file_result.updated <= '2020-09-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ status | count
+ -------------------------------------+--------
+ no-capture | 579952
+ success | 387325
+ no-pdf-link | 380406
+ terminal-bad-status | 63743
+ redirect-loop | 53893
+ cdx-error | 46024
+ spn2-cdx-lookup-failure | 28347
+ link-loop | 22573
+ gateway-timeout | 11686
+ wrong-mimetype | 6294
+ null-body | 3509
+ petabox-error | 2388
+ spn2-error | 1023
+ spn2-error:job-failed | 462
+ wayback-error | 347
+ spn2-error:soft-time-limit-exceeded | 20
+ bad-redirect | 11
+ (17 rows)
+
+What top domains for those `no-pdf-link` (or similar)?
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-09-15'
+ AND ingest_file_result.updated <= '2020-09-20'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ------------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 56488
+ figshare.com | no-pdf-link | 55337
+ www.egms.de | redirect-loop | 22686
+ zenodo.org | terminal-bad-status | 22128
+ tandf.figshare.com | no-pdf-link | 20027
+ springernature.figshare.com | no-pdf-link | 17181
+ cairn.info | terminal-bad-status | 13836
+ www.persee.fr | terminal-bad-status | 7565
+ projecteuclid.org | link-loop | 7449
+ www.cairn.info | no-pdf-link | 6992
+ scialert.net | no-pdf-link | 6621
+ www.cairn.info | link-loop | 5870
+ utpjournals.press | no-pdf-link | 5772
+ journals.openedition.org | redirect-loop | 5464
+ www.egms.de | no-pdf-link | 5223
+ archaeologydataservice.ac.uk | no-pdf-link | 4881
+ rs.figshare.com | no-pdf-link | 4773
+ www.degruyter.com | spn2-cdx-lookup-failure | 4763
+ koreascience.or.kr | no-pdf-link | 4487
+ cancerres.aacrjournals.org | no-pdf-link | 4124
+ cms.math.ca | no-pdf-link | 3441
+ volcano.si.edu | no-pdf-link | 3424
+ www.mathnet.ru | no-pdf-link | 3229
+ tidsskriftet.no | no-pdf-link | 3012
+ journals.plos.org | no-pdf-link | 3005
+ tudigit.ulb.tu-darmstadt.de | no-pdf-link | 2796
+ www.cairn.info:80 | link-loop | 2647
+ hammer.figshare.com | no-pdf-link | 2627
+ www.psychosocial.com | no-pdf-link | 2457
+ osf.io | terminal-bad-status | 2388
+ (30 rows)
+
+Should look at link extraction for these (quick-check sketch after the list):
+
+- scialert.net
+- utpjournals.press
+- koreascience.or.kr
+- cancerres.aacrjournals.org
+- cms.math.ca
+- volcano.si.edu
+- www.mathnet.ru
+- www.psychosocial.com
+
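+A quick way to triage these is to fetch a few landing pages and check whether
+they expose a `citation_pdf_url` meta tag; anything without one will need a
+site-specific extraction rule. Rough Python sketch (not the sandcrawler code;
+the regex assumes `name=` comes before `content=` in the tag):
+
+    import re
+    import sys
+    import requests
+
+    META_PDF = re.compile(
+        r'<meta[^>]+name="citation_pdf_url"[^>]+content="([^"]+)"',
+        re.IGNORECASE)
+
+    def find_pdf_meta(url: str):
+        """Return the citation_pdf_url value from a landing page, or None."""
+        resp = requests.get(url, timeout=30)
+        resp.raise_for_status()
+        match = META_PDF.search(resp.text)
+        return match.group(1) if match else None
+
+    if __name__ == "__main__":
+        for url in sys.argv[1:]:
+            print(url, find_pdf_meta(url))
+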
+## Re-Ingest
+
+Re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-09-15'
+ AND ingest_file_result.updated <= '2020-09-20'
+ AND ingest_file_result.status = 'no-capture'
+ -- AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ ) TO '/grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json';
+ => COPY 579952
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json | pv -l | shuf > /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json
+ => 579k 0:00:22 [25.9k/s]
+
+ cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Resuming progress on this in early December 2020.
+
+Filtered requests to re-crawl:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ((ingest_file_result.updated >= '2020-09-15' AND ingest_file_result.updated <= '2020-09-20')
+ OR (ingest_file_result.updated >= '2020-10-11'))
+ AND ingest_file_result.status != 'success'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json';
+ => COPY 2352614
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | pv -l > /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | jq -r .base_url | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt
+
+ wc -l /grande/snapshots/oa_doi_seedlist_2020-12-08.*.txt
+ 2352614 /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt
+ 481910 /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt
+
+Top DOI prefixes (same old usual suspects):
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg "://doi.org/" | cut -f4 -d/ | sort | uniq -c | sort -nr | head -n20
+ 353695 10.5281 zenodo.org
+ 121888 10.6084 figshare.org
+ 115093 10.3917 cairn.info
+ 113252 10.3406 persee.fr
+ 95414 10.1515 degruyter.com
+ 90448 10.4324 taylorfrancis.com
+ 83927 10.1016 elsevier
+ 60303 10.1109 IEEE
+ 48490 10.4000 openedition.org
+ 28498 10.3205 egms.de
+ 23433 10.1163 brill.com
+ 23276 10.17615 cdr.lib.unc.edu
+ 21386 10.1093 oup.com
+ 20783 10.3138 utpjournals.press
+ 19987 10.1201 tandfonline.com
+ 17916 10.34847 cocoon.huma-num.fr
+ 16970 10.1002 wiley.com
+ 15958 10.1097 lww.com (and others?)
+ 15835 10.1017 cambridge.org
+ 15466 10.24355 publikationsserver.tu-braunschweig.de (IR)
+
+Top domains (not doi.org):
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg -v "://doi.org/" | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
+ 104148 zenodo.org
+ 85245 www.persee.fr
+ 52931 www.cairn.info
+ 4791 www.jstage.jst.go.jp
+ 4411 archive.monthlyreview.org
+ 4129 osf.io
+ 2841 www.indianjournals.com
+ 2746 www.impan.pl
+ 2620 platform.almanhal.com
+ 2019 www.nomos-elibrary.de
+ 1209 dergipark.org.tr
+ 1027 pubs.geoscienceworld.org
+ 973 www.pdcnet.org
+ 923 www.hanspub.org
+ 914 www.repository.cam.ac.uk
+ 863 mediarep.org
+ 812 www.cartographicperspectives.org
+ 687 www.degruyter.com
+ 578 192.168.7.24
+ 566 journals.eco-vector.com
+
+TODO: infer `publisher_type` and platform from DOI prefix in more cases
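+
+A static prefix map would cover most of the cases above. Illustrative sketch;
+the field names and category values are assumptions, except `big5` and
+`scielo`, which already exist as `publisher_type` values:
+
+    # partial, hand-maintained map of DOI prefix to publisher/platform hints
+    DOI_PREFIX_HINTS = {
+        "10.5281": {"platform": "zenodo"},
+        "10.6084": {"platform": "figshare"},
+        "10.25384": {"platform": "figshare", "publisher": "SAGE"},
+        "10.3917": {"publisher": "Cairn"},          # cairn.info
+        "10.3406": {"publisher": "Persee"},         # persee.fr
+        "10.1016": {"publisher": "Elsevier", "publisher_type": "big5"},
+        "10.1109": {"publisher": "IEEE"},
+        "10.4000": {"platform": "openedition"},     # openedition.org
+    }
+
+    def doi_prefix_hints(doi: str) -> dict:
+        prefix = doi.split("/", 1)[0]
+        return DOI_PREFIX_HINTS.get(prefix, {})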
+
+## Post-Crawl Bulk Ingest
+
+Crawl has completed. Starting this bulk ingest on 2020-12-31; roughly 2.3
+million requests. Note these are all `pdf` requests, but crawl was done in an
+HTML-friendly way, so should be able to do domain/journal-specific HTML ingests
+in the future.
+
+ cat /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Stats, for this ingest period (fuzzy; will have some daily ingest stuff):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog')
+ AND ingest_file_result.updated >= '2020-12-28'
+ AND ingest_request.created <= '2020-12-09'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ status | count
+ -----------------------+--------
+ no-pdf-link | 962714
+ success | 539305
+ no-capture | 306590
+ redirect-loop | 192149
+ link-loop | 184797
+ terminal-bad-status | 141721
+ wrong-mimetype | 10362
+ null-body | 10277
+ skip-url-blocklist | 1985
+ wayback-content-error | 1300
+ cdx-error | 869
+ petabox-error | 160
+ bad-redirect | 72
+ wayback-error | 46
+ bad-gzip-encoding | 7
+ timeout | 1
+ max-hops-exceeded | 1
+ (17 rows)
+
diff --git a/notes/ingest/2020-09_reingest.md b/notes/ingest/2020-09_reingest.md
new file mode 100644
index 0000000..ec4e536
--- /dev/null
+++ b/notes/ingest/2020-09_reingest.md
@@ -0,0 +1,197 @@
+
+Goal: re-bulk-ingest some older existing crawls which hung on errors like
+`cdx-error` or `wayback-error`, indicating that ingest might actually succeed
+on retry.
+
+Sources:
+- unpaywall (again)
+- doi (ingest, changelog, etc)
+- mag
+- oai
+
+## DOI
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ status | count
+ -------------------------------------+---------
+ no-pdf-link | 8304582
+ success | 3461708
+ no-capture | 1881269
+ redirect-loop | 1851541
+ gateway-timeout | 355820
+ cdx-error | 341848
+ terminal-bad-status | 328650
+ skip-url-blocklist | 220474
+ spn2-cdx-lookup-failure | 125521
+ link-loop | 109352
+ wayback-error | 101525
+ null-body | 73539
+ wrong-mimetype | 53151
+ spn-error | 13579
+ spn2-error | 6848
+ spn2-error:job-failed | 4381
+ spn-remote-error | 4180
+ other-mimetype | 2305
+ petabox-error | 904
+ timeout | 710
+ spn2-error:soft-time-limit-exceeded | 557
+ spn2-error:proxy-error | 437
+ spn2-error:browser-running-error | 273
+ invalid-host-resolution | 233
+ pending | 116
+ (25 rows)
+
+Bulk:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_doi_errors_2020-09-03.rows.json';
+ => 443421
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+An additional 27,779 in success status? Hard to tell because lots of other
+ingest jobs were running in parallel.
+
+Live:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_file_result.status = 'spn-error' OR
+ ingest_file_result.status = 'spn2-cdx-lookup-failure' OR
+ ingest_file_result.status = 'spn2-error:job-failed' OR
+ ingest_file_result.status = 'spn2-error:proxy-error'
+ )
+ ) TO '/grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json';
+ => 143984
+
+    ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## Unpaywall (again)
+
+Bulk:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json';
+ => 43912
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+## MAG
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_mag_errors_2020-09-03.rows.json';
+ => 188175
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_mag_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+## OAI-PMH
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND (
+ ingest_file_result.status = 'cdx-error' OR
+ ingest_file_result.status = 'wayback-error'
+ )
+ ) TO '/grande/snapshots/ingest_oai_errors_2020-09-03.rows.json';
+ => 851056
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_oai_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json
+
+ cat /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done
+
+---------
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';
+
diff --git a/notes/ingest/2020-09_scielo.md b/notes/ingest/2020-09_scielo.md
new file mode 100644
index 0000000..4ec6fbd
--- /dev/null
+++ b/notes/ingest/2020-09_scielo.md
@@ -0,0 +1,21 @@
+
+As a follow-up to `SCIELO-CRAWL-2020-07`, going to bulk ingest all existing
+fatcat releases with no IA copy and with `publisher_type:scielo`. There are
+200k+ such releases.
+
+It seems like some of these are HTML or XML, eg: https://doi.org/10.4321/s1132-12962011000300008
+
+Could try XML ingest of these!
+
+## Bulk Ingest
+
+Dump ingest requests
+
+ ./fatcat_ingest.py --allow-non-oa query "publisher_type:scielo" | pv -l > /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json
+ Expecting 212529 release objects in search queries
+
+Enqueue
+
+ cat /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done 2020-09-14
+
diff --git a/notes/ingest/2020-10_daily.md b/notes/ingest/2020-10_daily.md
new file mode 100644
index 0000000..d2bb50b
--- /dev/null
+++ b/notes/ingest/2020-10_daily.md
@@ -0,0 +1,193 @@
+
+Quick notes on how daily ingest is going, circa September/October 2020.
+
+
+ SELECT ingest_request.ingest_type,
+ date(ingest_request.created),
+ COUNT(*) as total,
+ COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created)
+ ORDER BY date(ingest_request.created) DESC;
+
+ ingest_type | date | total | success
+ -------------+------------+-------+---------
+ pdf | 2020-10-10 | 6145 | 1368
+ pdf | 2020-10-09 | 28453 | 6461
+ pdf | 2020-10-08 | 15105 | 3803
+ pdf | 2020-10-07 | 34213 | 10813
+ pdf | 2020-10-06 | 22263 | 8565
+ pdf | 2020-10-05 | 7910 | 3200
+ pdf | 2020-10-04 | 10865 | 4579
+ pdf | 2020-10-03 | 27745 | 10818
+ pdf | 2020-10-02 | 34320 | 13523
+ pdf | 2020-10-01 | 32548 | 13252
+ pdf | 2020-09-30 | 34798 | 14113
+ pdf | 2020-09-29 | 22463 | 8328
+ pdf | 2020-09-28 | 4117 | 1278
+ pdf | 2020-09-27 | 5894 | 1732
+ pdf | 2020-09-26 | 34949 | 13901
+ pdf | 2020-09-25 | 33680 | 10605
+ pdf | 2020-09-24 | 15125 | 5785
+ pdf | 2020-09-23 | 20866 | 6584
+ pdf | 2020-09-22 | 20949 | 7167
+ pdf | 2020-09-21 | 22483 | 7308
+ pdf | 2020-09-20 | 45644 | 16981
+ pdf | 2020-09-19 | 95571 | 31991
+ pdf | 2020-09-18 | 50849 | 15875
+ pdf | 2020-09-17 | 20121 | 3158
+ pdf | 2020-09-16 | 39184 | 12150
+ pdf | 2020-09-15 | 16986 | 7705
+ (26 rows)
+
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+--------
+ pdf | success | 241047
+ pdf | no-pdf-link | 143084
+ pdf | spn2-cdx-lookup-failure | 108311
+ pdf | gateway-timeout | 97250
+ pdf | cdx-error | 61820
+ pdf | link-loop | 31350
+ pdf | wayback-error | 9139
+ pdf | spn2-error:job-failed | 4240
+ pdf | spn2-error | 3893
+ pdf | wrong-mimetype | 1010
+ pdf | no-capture | 851
+ pdf | null-body | 605
+ pdf | redirect-loop | 261
+ pdf | spn2-error:soft-time-limit-exceeded | 126
+ pdf | terminal-bad-status | 120
+ pdf | petabox-error | 105
+ pdf | timeout | 29
+ pdf | spn2-error:no-status | 2
+ pdf | spn2-error:invalid-server-response | 2
+ pdf | bad-gzip-encoding | 1
+ (20 rows)
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+
+ domain | status | count
+ ------------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 52767
+ www.degruyter.com | link-loop | 17666
+ www.degruyter.com | spn2-cdx-lookup-failure | 17597
+ ieeexplore.ieee.org | gateway-timeout | 15290
+ www.sciencedirect.com | no-pdf-link | 14043
+ apps.crossref.org | no-pdf-link | 11531
+ figshare.com | no-pdf-link | 8966
+ tandf.figshare.com | no-pdf-link | 7276
+ zenodo.org | no-capture | 7191
+ springernature.figshare.com | no-pdf-link | 6485
+ www.taylorfrancis.com | link-loop | 6266
+ www.persee.fr | terminal-bad-status | 6031
+ journals.openedition.org | gateway-timeout | 5639
+ www.cairn.info | link-loop | 5618
+ archaeologydataservice.ac.uk | no-pdf-link | 5359
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 4748
+ www.e-periodica.ch | no-pdf-link | 4722
+ osf.io | no-capture | 4247
+ cancerres.aacrjournals.org | no-pdf-link | 4136
+ dlc.library.columbia.edu | no-pdf-link | 4085
+ www.egms.de | no-pdf-link | 3304
+ journals.lww.com | no-pdf-link | 3218
+ journals.plos.org | no-pdf-link | 3005
+ linkinghub.elsevier.com | gateway-timeout | 2833
+ www.egms.de | redirect-loop | 2606
+ (25 rows)
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ --------------------------------------+---------+-------
+ zenodo.org | success | 55549
+ arxiv.org | success | 24450
+ s3-eu-west-1.amazonaws.com | success | 18156
+ res.mdpi.com | success | 13493
+ www.degruyter.com | success | 12009
+ journals.openedition.org | success | 11235
+ www.jstage.jst.go.jp | success | 9460
+ peer.asee.org | success | 9416
+ www.e-periodica.ch | success | 8105
+ ir.canterbury.ac.nz | success | 6381
+ europepmc.org | success | 5670
+ www.repository.cam.ac.uk | success | 4858
+ assets.researchsquare.com | success | 4765
+ fjfsdata01prod.blob.core.windows.net | success | 4130
+ tidsskrift.dk | success | 3964
+ research-journal.org | success | 3127
+ ieeexplore.ieee.org | success | 2947
+ dergipark.org.tr | success | 2892
+ watermark.silverchair.com | success | 2315
+ journals.plos.org | success | 2304
+ journal.fi | success | 1996
+ publications.rwth-aachen.de | success | 1954
+ www.brazilianjournals.com | success | 1637
+ article.sciencepublishinggroup.com | success | 1589
+ revistas.upr.edu | success | 1467
+ (25 rows)
+
+Casual take-aways:
+- wonder what `apps.crossref.org` is
+- sciencedirect crawling broken?
+- figshare might be broken? or just very little success
+- seems like a lot of journals.plos.org failures
diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md
new file mode 100644
index 0000000..a991025
--- /dev/null
+++ b/notes/ingest/2020-10_unpaywall.md
@@ -0,0 +1,286 @@
+
+New snapshot released 2020-10-09. Want to do a mostly straight-forward
+load/ingest/crawl.
+
+Proposed changes this time around:
+
+- have bulk ingest store missing URLs in a new sandcrawler-db for `no-capture`
+ status, and to include those URLs in heritrix3 crawl
+- tweak heritrix3 config for additional PDF URL extraction patterns,
+  particularly to improve OJS yield (example URL rewrite sketched below)
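+
+For the OJS case, the galley-to-PDF rewrite is predictable enough; roughly
+(Python for illustration only, the actual change is heritrix3 extractor
+configuration):
+
+    import re
+
+    # OJS galley pages usually look like .../article/view/<articleId>/<galleyId>;
+    # the direct PDF download is usually .../article/download/<articleId>/<galleyId>
+    OJS_GALLEY = re.compile(r"^(?P<base>https?://.+)/article/view/(?P<ids>\d+/\d+)$")
+
+    def ojs_pdf_url(url: str):
+        m = OJS_GALLEY.match(url)
+        if m:
+            return "{}/article/download/{}".format(m.group("base"), m.group("ids"))
+        return None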
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/unpaywall/unpaywall_snapshot_2020-10-09T153852.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json
+ => 28.3M 3:19:03 [2.37k/s]
+
+    cat /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => 28.3M 1:11:29 [ 6.6k/s]
+ => Worker: Counter({'total': 28298500, 'insert-requests': 4119939, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 28298500, 'pushed': 28298500})
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2020-10-09'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json';
+ => COPY 4216339
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json
+ => 4.22M 0:02:48 [ 25k/s]
+
+Start small, to test no-capture behavior:
+
+ cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | head -n1000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+`no-capture` change looks good. Enqueue the whole batch:
+
+ cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+----------
+ success | 23661282
+ no-capture | 3015447
+ no-pdf-link | 2302102
+ redirect-loop | 1542566
+ terminal-bad-status | 1044676
+ wrong-mimetype | 114315
+ link-loop | 36358
+ cdx-error | 20150
+ null-body | 14513
+ wayback-error | 13644
+ gateway-timeout | 3776
+ spn2-cdx-lookup-failure | 1260
+ petabox-error | 1171
+ redirects-exceeded | 752
+ invalid-host-resolution | 464
+ spn2-error | 147
+ bad-redirect | 131
+ spn2-error:job-failed | 91
+ wayback-content-error | 45
+ timeout | 19
+ (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ ) t1
+ ) TO '/grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json';
+ => 2,936,404
+
+ # TODO: in the future also exclude "www.archive.org"
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | pv -l > /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.url.txt
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.terminal_url.txt
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.no_terminal_url.txt
+
+ wc -l unpaywall_seedlist_2020-11-02.*.txt
+ 2701178 unpaywall_seedlist_2020-11-02.terminal_url.txt
+ 2713866 unpaywall_seedlist_2020-11-02.url.txt
+
+Because of things like jsessionid in the base URLs, suspect that crawling just
+the terminal URLs is going to work better than crawling both base and terminal URLs.
+
+A fraction of the `no-capture` rows have partial/stub URLs as the terminal URL.
+
+TODO: investigate scale of partial/stub `terminal_url` (eg, not HTTP/S or FTP).
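+
+A quick way to get that number from the rows dump above (sketch; assumes the
+`.rows.json` layout with the ingest_file_result nested under `result`):
+
+    import json
+    import sys
+
+    def count_stub_terminal_urls(path: str) -> int:
+        """Count rows whose terminal_url is not an absolute http(s)/ftp URL."""
+        bad = 0
+        with open(path) as f:
+            for line in f:
+                row = json.loads(line)
+                terminal_url = (row.get("result") or {}).get("terminal_url") or ""
+                if not terminal_url.startswith(("http://", "https://", "ftp://")):
+                    bad += 1
+        return bad
+
+    if __name__ == "__main__":
+        print(count_stub_terminal_urls(sys.argv[1]))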
+
+
+## Bulk Ingest and Status
+
+Note, removing archive.org links:
+
+ cat /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json | rg -v www.archive.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Overall status (checked 2020-12-08):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 25004559
+ no-pdf-link | 2531841
+ redirect-loop | 1671375
+ terminal-bad-status | 1389463
+ no-capture | 893880
+ wrong-mimetype | 119332
+ link-loop | 66508
+ wayback-content-error | 30339
+ cdx-error | 21790
+ null-body | 20710
+ wayback-error | 13976
+ gateway-timeout | 3775
+ petabox-error | 2420
+ spn2-cdx-lookup-failure | 1218
+ redirects-exceeded | 889
+ invalid-host-resolution | 464
+ bad-redirect | 147
+ spn2-error | 112
+ spn2-error:job-failed | 91
+ timeout | 21
+ (20 rows)
+
+Ingest stats broken down by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+
+ release_stage | status | count
+ ---------------+-------------------------------------+----------
+ accepted | success | 1101090
+ accepted | no-pdf-link | 28590
+ accepted | redirect-loop | 10923
+ accepted | no-capture | 9540
+ accepted | terminal-bad-status | 6339
+ accepted | cdx-error | 952
+ accepted | wrong-mimetype | 447
+ accepted | link-loop | 275
+ accepted | wayback-error | 202
+ accepted | petabox-error | 177
+ accepted | redirects-exceeded | 122
+ accepted | null-body | 27
+ accepted | wayback-content-error | 14
+ accepted | spn2-cdx-lookup-failure | 5
+ accepted | gateway-timeout | 4
+ accepted | bad-redirect | 1
+ published | success | 18595278
+ published | no-pdf-link | 2434935
+ published | redirect-loop | 1364110
+ published | terminal-bad-status | 1185328
+ published | no-capture | 718792
+ published | wrong-mimetype | 112923
+ published | link-loop | 63874
+ published | wayback-content-error | 30268
+ published | cdx-error | 17302
+ published | null-body | 15209
+ published | wayback-error | 10782
+ published | gateway-timeout | 1966
+ published | petabox-error | 1611
+ published | spn2-cdx-lookup-failure | 879
+ published | redirects-exceeded | 760
+ published | invalid-host-resolution | 453
+ published | bad-redirect | 115
+ published | spn2-error:job-failed | 77
+ published | spn2-error | 75
+ published | timeout | 21
+ published | bad-gzip-encoding | 5
+ published | spn2-error:soft-time-limit-exceeded | 4
+ published | spn2-error:pending | 1
+ published | blocked-cookie | 1
+ published | | 1
+ published | pending | 1
+ submitted | success | 5308166
+ submitted | redirect-loop | 296322
+ submitted | terminal-bad-status | 197785
+ submitted | no-capture | 165545
+ submitted | no-pdf-link | 68274
+ submitted | wrong-mimetype | 5962
+ submitted | null-body | 5474
+ submitted | cdx-error | 3536
+ submitted | wayback-error | 2992
+ submitted | link-loop | 2359
+ submitted | gateway-timeout | 1805
+ submitted | petabox-error | 632
+ submitted | spn2-cdx-lookup-failure | 334
+ submitted | wayback-content-error | 57
+ submitted | spn2-error | 37
+ submitted | bad-redirect | 31
+ submitted | spn2-error:job-failed | 14
+ submitted | | 12
+ submitted | invalid-host-resolution | 11
+ submitted | redirects-exceeded | 7
+ submitted | spn2-error:soft-time-limit-exceeded | 5
+ submitted | bad-gzip-encoding | 1
+ submitted | skip-url-blocklist | 1
+ | no-pdf-link | 42
+ | success | 25
+ | redirect-loop | 20
+ | terminal-bad-status | 11
+ | no-capture | 3
+ (70 rows)
diff --git a/notes/ingest/2020-11-04_arxiv.md b/notes/ingest/2020-11-04_arxiv.md
new file mode 100644
index 0000000..f9abe09
--- /dev/null
+++ b/notes/ingest/2020-11-04_arxiv.md
@@ -0,0 +1,12 @@
+
+Ran a bulk dump using fatcat ingest tool several months ago, and had Martin run
+a crawl.
+
+Crawl is now done, so going to ingest, hoping to get the majority of the
+millions of remaining arxiv.org PDFs.
+
+ zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l
+ => 1,288,559
+
+ zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-11_doaj.md b/notes/ingest/2020-11_doaj.md
new file mode 100644
index 0000000..473dd0d
--- /dev/null
+++ b/notes/ingest/2020-11_doaj.md
@@ -0,0 +1,295 @@
+
+This is the first ingest (and crawl) of URLs from DOAJ article-level metadata.
+It will include at least 'pdf' and 'html' ingest requests, not just 'pdf' as in
+the past.
+
+Working off a 2020-11-13 snapshot.
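+
+The per-request ingest_type comes from the DOAJ article-level fulltext link
+metadata; very roughly as below. This is a simplified sketch, not the actual
+`doaj2ingestrequest.py` logic, and the content-type values are assumptions:
+
+    CONTENT_TYPE_MAP = {
+        "pdf": "pdf",
+        "html": "html",
+        "xml": "xml",
+    }
+
+    def ingest_type_for_link(link: dict):
+        """Map a DOAJ bibjson fulltext link to an ingest_type, or None to skip."""
+        if link.get("type") != "fulltext":
+            return None
+        content_type = (link.get("content_type") or "").lower()
+        if content_type in CONTENT_TYPE_MAP:
+            return CONTENT_TYPE_MAP[content_type]
+        # fall back to guessing from the URL itself
+        if (link.get("url") or "").lower().endswith(".pdf"):
+            return "pdf"
+        return None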
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json
+ => 6.7M 0:24:28 [4.57k/s]
+
+ cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => ran in to error with blank `base_url`
+
+Second try after patches:
+
+ zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json
+ => 6.7M 0:24:29 [4.56k/s]
+
+ cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 6703036, 'insert-requests': 163854, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 6703036, 'pushed': 6703036})
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+-------------------------+---------
+ pdf | | 3711532
+ html | | 2429003
+ pdf | success | 454403
+ pdf | redirect-loop | 48587
+ pdf | no-pdf-link | 24901
+ pdf | no-capture | 11569
+ xml | | 9442
+ pdf | link-loop | 8466
+ pdf | terminal-bad-status | 2015
+ pdf | wrong-mimetype | 1441
+ pdf | null-body | 1057
+ pdf | petabox-error | 299
+ pdf | cdx-error | 124
+ pdf | gateway-timeout | 114
+ pdf | wayback-error | 77
+ pdf | spn2-cdx-lookup-failure | 20
+ pdf | invalid-host-resolution | 4
+ pdf | spn2-error | 1
+ (18 rows)
+
+## Dump new URLs, Transform, Bulk Ingest (PDF and XML only)
+
+Dump:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'xml')
+ AND ingest_request.link_source = 'doaj'
+ -- AND date(ingest_request.created) > '2020-12-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/grande/snapshots/doaj_noingest_2020-11-19.rows.json';
+ => COPY 3732543
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_noingest_2020-11-19.rows.json | pv -l | shuf > /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json
+ => 3.73M 0:02:18 [26.9k/s]
+
+Definitely some non-URL strings in there; should try to filter those out
+earlier in the transform process. And/or have a constraint on the URL column in
+the database.
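+
+In the meantime, an easy sanity filter to apply before persisting or
+enqueueing (a sketch; the `rg -v "\\\\"` step in the pipelines above already
+drops some of the garbage):
+
+    import json
+    import sys
+    from urllib.parse import urlparse
+
+    def valid_base_url(url: str) -> bool:
+        """Very loose check: absolute http(s) URL with a hostname."""
+        if not url:
+            return False
+        try:
+            parsed = urlparse(url)
+        except ValueError:
+            return False
+        return parsed.scheme in ("http", "https") and bool(parsed.netloc)
+
+    if __name__ == "__main__":
+        # filter a stream of ingest request JSON lines down to plausible URLs
+        for line in sys.stdin:
+            request = json.loads(line)
+            if valid_base_url(request.get("base_url", "")):
+                sys.stdout.write(line)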
+
+Enqueue the whole batch:
+
+ cat /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Started this batch off at 2020-11-19 18:10 (Pacific time)
+
+Stats after run:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ ORDER BY ingest_request.ingest_type, COUNT DESC
+ LIMIT 30;
+
+## Dump Seedlist
+
+After preliminary bulk ingest attempts, dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ AND (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'xml')
+ AND ingest_file_result.status != 'success'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/grande/snapshots/doaj_seedlist_2020-11-19.rows.json';
+ => 1,899,555
+
+TODO: filter for valid URLs
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | pv -l > /grande/snapshots/doaj_crawl_ingest_2020-11-19.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.url.txt
+ cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.terminal_url.txt
+ cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.no_terminal_url.txt
+
+ wc -l doaj_seedlist_2020-11-19.*.txt
+
+## Post-Crawl Ingest
+
+Re-run all ingests, from original batch (pdf, xml, and html), now that DOAJ
+identifiers are all in fatcat:
+
+ cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ # started 2020-12-23 15:05 (Pacific)
+ # finished around 2020-12-31, after one long/slow partition
+
+Stats again after everything:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ ORDER BY ingest_request.ingest_type, COUNT DESC
+ LIMIT 50;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ html | wrong-scope | 1089423
+ html | no-capture | 423917
+ html | redirect-loop | 212910
+ html | unknown-scope | 204069
+ html | html-resource-no-capture | 165587
+ html | success | 122937
+ html | null-body | 100296
+ html | wayback-content-error | 53918
+ html | wrong-mimetype | 18908
+ html | terminal-bad-status | 14059
+ html | petabox-error | 13520
+ html | cdx-error | 6823
+ html | wayback-error | 890
+ html | | 620
+ html | blocked-cookie | 543
+ html | blocked-captcha | 250
+ html | redirects-exceeded | 135
+ html | too-many-resources | 111
+ html | max-hops-exceeded | 84
+ html | bad-redirect | 3
+ pdf | success | 2851324
+ pdf | no-pdf-link | 529914
+ pdf | redirect-loop | 349494
+ pdf | no-capture | 272202
+ pdf | null-body | 129027
+ pdf | terminal-bad-status | 91796
+ pdf | link-loop | 25267
+ pdf | wrong-mimetype | 6504
+ pdf | wayback-error | 2968
+ pdf | | 2068
+ pdf | wayback-content-error | 1548
+ pdf | cdx-error | 1095
+ pdf | petabox-error | 1024
+ pdf | bad-redirect | 203
+ pdf | redirects-exceeded | 135
+ pdf | timeout | 20
+ pdf | max-hops-exceeded | 19
+ pdf | bad-gzip-encoding | 2
+ xml | success | 6897
+ xml | null-body | 2353
+ xml | wrong-mimetype | 184
+ xml | no-capture | 5
+ xml | cdx-error | 3
+ (43 rows)
+
+
+And on filtered subset that we actually crawled:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ AND (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'xml')
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ GROUP BY ingest_request.ingest_type, status
+ ORDER BY ingest_request.ingest_type, COUNT DESC
+ LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-----------------------+---------
+ pdf | success | 2851286
+ pdf | no-pdf-link | 527495
+ pdf | redirect-loop | 345138
+ pdf | no-capture | 268140
+ pdf | null-body | 129027
+ pdf | terminal-bad-status | 91125
+ pdf | link-loop | 25267
+ pdf | wrong-mimetype | 6504
+ pdf | wayback-error | 2907
+ pdf | petabox-error | 363
+ pdf | wayback-content-error | 242
+ pdf | bad-redirect | 203
+ pdf | redirects-exceeded | 135
+ pdf | max-hops-exceeded | 19
+ pdf | cdx-error | 15
+ pdf | bad-gzip-encoding | 2
+ xml | success | 6897
+ xml | null-body | 2353
+ xml | wrong-mimetype | 184
+ xml | no-capture | 5
+ (20 rows)
+
diff --git a/notes/ingest/2020-12-08_patch_crawl_notes.md b/notes/ingest/2020-12-08_patch_crawl_notes.md
new file mode 100644
index 0000000..5979753
--- /dev/null
+++ b/notes/ingest/2020-12-08_patch_crawl_notes.md
@@ -0,0 +1,111 @@
+
+Notes here about re-ingesting or re-crawling large batches. Goal around end of
+2020 is to generate a broad patch crawl of terminal no-capture attempts for all
+major sources crawled thus far. Have already tried running this process for unpaywall.
+
+For each, want filtered ingest request JSON objects (filtering out platforms
+that don't crawl well, and possibly things like figshare+zenodo), and a broader
+seedlist (including terminal URLs). Will de-dupe all the seedlist URLs and do a
+heritrix crawl with new config, then re-ingest all the requests individually.
+
+Summary of what to do here:
+
+ OA DOI: expecting some 2.4 million seeds
+    OAI-PMH: expecting some 5 million no-capture URLs, plus more where the PDF URL was not found
+ Unpaywall: another ~900k no-capture URLs (maybe filtered?)
+
+For all, re-attempt for these status codes:
+
+ no-capture
+ cdx-error
+ wayback-error
+ petabox-error
+ gateway-timeout (?)
+
+And at least do bulk re-ingest for these, if updated before 2020-11-20 or so:
+
+ no-pdf-link
+
+## OAI-PMH
+
+Need to re-ingest all of the (many!) no-capture and no-pdf-link
+
+TODO: repec-specific URL extraction?
+
+Skip these OAI prefixes:
+
+ kb.dk
+ bnf.fr
+ hispana.mcu.es
+ bdr.oai.bsb-muenchen.de
+ ukm.si
+ hsp.org
+
+Skip these domains:
+
+ www.kb.dk (kb.dk)
+ kb-images.kb.dk (kb.dk)
+ mdz-nbn-resolving.de (TODO: what prefix?)
+ aggr.ukm.um.si (ukm.si)
+
+Check PDF link extraction for these prefixes, or skip them (TODO):
+
+ repec (mixed success)
+ biodiversitylibrary.org
+ juser.fz-juelich.de
+ americanae.aecid.es
+ www.irgrid.ac.cn
+ hal
+ espace.library.uq.edu.au
+ igi.indrastra.com
+ invenio.nusl.cz
+ hypotheses.org
+ t2r2.star.titech.ac.jp
+ quod.lib.umich.edu
+
+ domain: hemerotecadigital.bne.es
+ domain: bib-pubdb1.desy.de
+ domain: publikationen.bibliothek.kit.edu
+ domain: edoc.mpg.de
+ domain: bibliotecadigital.jcyl.es
+ domain: lup.lub.lu.se
+ domain: orbi.uliege.be
+
+TODO:
+- consider deleting ingest requests from skipped prefixes (large database use)
+
+
+## Unpaywall
+
+About 900k `no-capture`, and up to 2.5 million more `no-pdf-link`.
+
+Re-bulk-ingest filtered requests which hit `no-pdf-link` before 2020-11-20:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) < '2020-11-20'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ ) TO '/grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json';
+ => COPY 1309990
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_nopdflink_2020-12-08.ingest_request.json
+ => 1.31M 0:00:51 [25.6k/s]
+
+ cat /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2021-04_unpaywall.md b/notes/ingest/2021-04_unpaywall.md
new file mode 100644
index 0000000..d7643f4
--- /dev/null
+++ b/notes/ingest/2021-04_unpaywall.md
@@ -0,0 +1,368 @@
+
+New snapshot released 2021-02-18, finally getting around to a crawl two months
+later.
+
+Intend to do the same style of crawl as in the past. One change is that
+sandcrawler-db has moved to a focal (Ubuntu 20.04) VM.
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18T160139.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json
+ => 30.0M 3:14:59 [2.57k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 30027007, 'insert-requests': 2703999, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 30027007, 'pushed': 30027007})
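+
+For a snapshot this size, a quick trial on the first few thousand lines (same scripts, just a `head` inserted) can catch transform problems before committing to the multi-hour full run; a sketch:
+
+    zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18T160139.jsonl.gz | head -n1000 | ./scripts/unpaywall2ingestrequest.py - | ./persist_tool.py ingest-request -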
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json';
+ => COPY 3277484
+
+ # previous, 2020-10 run: COPY 4216339
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json
+ => 3.28M 0:01:42 [32.1k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+----------
+ success | 26385866
+ no-pdf-link | 2132565
+ no-capture | 2092111
+ redirect-loop | 1732543
+ terminal-bad-status | 1504555
+ wayback-content-error | 357345
+ wrong-mimetype | 126070
+ link-loop | 76808
+ cdx-error | 22756
+ null-body | 22066
+ wayback-error | 13768
+ gateway-timeout | 3804
+ petabox-error | 3608
+ spn2-cdx-lookup-failure | 1225
+ redirects-exceeded | 892
+ invalid-host-resolution | 505
+ bad-redirect | 151
+ spn2-error | 108
+ spn2-error:job-failed | 91
+ bad-gzip-encoding | 27
+ (20 rows)
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-01-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 1348623
+ no-capture | 1231582
+ redirect-loop | 45622
+ no-pdf-link | 37312
+ terminal-bad-status | 24162
+ wrong-mimetype | 6684
+ link-loop | 5757
+ null-body | 1288
+ wayback-content-error | 1123
+ cdx-error | 831
+ petabox-error | 697
+ wayback-error | 185
+ invalid-host-resolution | 41
+ gateway-timeout | 29
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ spn2-cdx-lookup-failure | 7
+ bad-redirect | 4
+ timeout | 3
+ redirects-exceeded | 3
+ (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json';
+ => 2020-10: 2,936,404
+ => 2021-04: 1,805,192
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json
+ => 1.81M 0:01:27 [20.6k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.*.txt
+ 6 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+ 1668524 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ 1685717 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
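+
+For handing off to the crawler, the base and terminal URL lists can be merged into a single de-duplicated seed file (a sketch; the combined file name is an assumption):
+
+    cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt \
+        /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt \
+        | sort -u -S 4G \
+        > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.combined.txt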
+
+## Post-Crawl Bulk Ingest
+
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => 1,804,211 consumer group lag
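+
+To watch the bulk workers burn down that lag, the standard Kafka tooling can be pointed at the broker (the consumer group name and port below are assumptions, not confirmed in these notes):
+
+    # group name is a placeholder; substitute the actual bulk ingest worker group
+    kafka-consumer-groups.sh --bootstrap-server wbgrp-svc263.us.archive.org:9092 \
+        --describe --group sandcrawler-ingest-file-bulk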
+
+## Post-Ingest Stats
+
+Overall status (unpaywall, all time):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 27242251
+ no-pdf-link | 2746237
+ redirect-loop | 1821132
+ terminal-bad-status | 1553441
+ no-capture | 478559
+ wayback-content-error | 357390
+ wrong-mimetype | 127365
+ link-loop | 79389
+ cdx-error | 23170
+ null-body | 23169
+ wayback-error | 13704
+ gateway-timeout | 3803
+ petabox-error | 3642
+ redirects-exceeded | 1427
+ spn2-cdx-lookup-failure | 1214
+ invalid-host-resolution | 505
+ bad-redirect | 153
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ body-too-large | 84
+ (20 rows)
+
+Ingest stats broken down by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+ release_stage | status | count
+ ---------------+-------------------------------------+----------
+ accepted | success | 1213335
+ accepted | no-pdf-link | 29292
+ accepted | redirect-loop | 12769
+ accepted | terminal-bad-status | 11264
+ accepted | no-capture | 10187
+ accepted | cdx-error | 1015
+ accepted | wayback-content-error | 757
+ accepted | wrong-mimetype | 501
+ accepted | link-loop | 407
+ accepted | wayback-error | 207
+ accepted | petabox-error | 189
+ accepted | redirects-exceeded | 125
+ accepted | null-body | 34
+ accepted | spn2-cdx-lookup-failure | 5
+ accepted | gateway-timeout | 4
+ accepted | blocked-cookie | 2
+ accepted | bad-redirect | 1
+ accepted | body-too-large | 1
+ published | success | 20196774
+ published | no-pdf-link | 2647969
+ published | redirect-loop | 1477558
+ published | terminal-bad-status | 1320013
+ published | wayback-content-error | 351931
+ published | no-capture | 297603
+ published | wrong-mimetype | 115440
+ published | link-loop | 76431
+ published | cdx-error | 18125
+ published | null-body | 17559
+ published | wayback-error | 10466
+ published | petabox-error | 2684
+ published | gateway-timeout | 1979
+ published | redirects-exceeded | 947
+ published | spn2-cdx-lookup-failure | 877
+ published | invalid-host-resolution | 457
+ published | bad-redirect | 120
+ published | spn2-error:job-failed | 77
+ published | spn2-error | 70
+ published | body-too-large | 39
+ published | bad-gzip-encoding | 24
+ published | timeout | 24
+ published | blocked-cookie | 23
+ published | spn2-error:soft-time-limit-exceeded | 4
+ published | | 2
+ published | pending | 1
+ published | spn2-error:pending | 1
+ published | too-many-redirects | 1
+ submitted | success | 5832117
+ submitted | redirect-loop | 330785
+ submitted | terminal-bad-status | 222152
+ submitted | no-capture | 170766
+ submitted | no-pdf-link | 68934
+ submitted | wrong-mimetype | 11424
+ submitted | null-body | 5576
+ submitted | wayback-content-error | 4702
+ submitted | cdx-error | 4030
+ submitted | wayback-error | 3031
+ submitted | link-loop | 2551
+ submitted | gateway-timeout | 1820
+ submitted | petabox-error | 769
+ submitted | redirects-exceeded | 355
+ submitted | spn2-cdx-lookup-failure | 332
+ submitted | invalid-host-resolution | 48
+ submitted | body-too-large | 44
+ submitted | spn2-error | 37
+ submitted | bad-redirect | 32
+ submitted | spn2-error:job-failed | 14
+ submitted | | 13
+ submitted | spn2-error:soft-time-limit-exceeded | 5
+ submitted | timeout | 4
+ submitted | bad-gzip-encoding | 3
+ submitted | skip-url-blocklist | 1
+ | no-pdf-link | 42
+ | success | 25
+ | redirect-loop | 20
+ | terminal-bad-status | 12
+ | no-capture | 3
+ (76 rows)
+
+
+Only the recent updates:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 2192376
+ no-capture | 152183
+ no-pdf-link | 144174
+ redirect-loop | 125988
+ terminal-bad-status | 67307
+ link-loop | 8292
+ wrong-mimetype | 7942
+ null-body | 2270
+ cdx-error | 1223
+ wayback-content-error | 1147
+ petabox-error | 728
+ wayback-error | 155
+ body-too-large | 82
+ invalid-host-resolution | 41
+ gateway-timeout | 28
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ timeout | 7
+ bad-redirect | 6
+ redirects-exceeded | 4
+ (20 rows)
+
+In total, this iteration of unpaywall ingest resulted in:
+
+- 2,703,999 raw ingest requests (new URLs total)
+- 1,231,582 (45.5%) of these had not been seen/crawled from any source yet
+- 843,753 (31.2%) success from new heritrix crawling
+- 2,192,376 (81.1%) total success (including URLs crawled initially for other reasons; out of all new URLs, including those not expected to succeed)
diff --git a/notes/ingest/2021-05_daily_improvements.md b/notes/ingest/2021-05_daily_improvements.md
new file mode 100644
index 0000000..e8748fa
--- /dev/null
+++ b/notes/ingest/2021-05_daily_improvements.md
@@ -0,0 +1,480 @@
+
+Summary of top large broken domains (2021-04-21 "30 day" snapshot):
+
+## acervus.unicamp.br
+
+ domain | status | count
+---------------------------------------+-------------------------+--------
+ acervus.unicamp.br | | 1967
+ acervus.unicamp.br | no-pdf-link | 1853
+
+    select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%acervus.unicamp.br%' and status = 'no-pdf-link' limit 5;
+
+http://acervus.unicamp.br/index.asp?codigo_sophia=963332
+
+seems like many of these were captures with a blank page? or a redirect to
+the homepage?
+
+http://web.archive.org/web/20200129110523/http://acervus.unicamp.br/index.html
+
+messy, going to move on.
+
+
+## apex.ipk-gatersleben.de
+
+apex.ipk-gatersleben.de | | 1253
+apex.ipk-gatersleben.de | no-pdf-link | 1132
+
+    select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%apex.ipk-gatersleben.de%' and status = 'no-pdf-link' limit 5;
+
+https://doi.org/10.25642/ipk/rescoll/4886
+https://apex.ipk-gatersleben.de/apex/f?p=PGRDOI:RESOLVE:::NO:RP:DOI:10.25642/IPK/RESCOLL/7331
+
+seem to be datasets/species, not articles.
+
+prefix: 10.25642/ipk
+
+## crossref.org
+
+ apps.crossref.org | | 4693
+ apps.crossref.org | no-pdf-link | 4075
+
+https://doi.org/10.1515/9781501747045-013
+https://apps.crossref.org/coaccess/coaccess.html?doi=10.1515%2F9781501747045-013
+
+Derp, they are doing a dynamic/AJAX thing, so access links are not in the HTML.
+
+## openedition.org
+
+ books.openedition.org | | 1784
+ books.openedition.org | no-pdf-link | 1466
+
+https://doi.org/10.4000/books.pul.34492
+https://books.openedition.org/pul/34492
+
+these are not actually OA books (or at least, not all are)
+
+## chemrxiv.org (figshare)
+
+ chemrxiv.org | | 857
+ chemrxiv.org | no-pdf-link | 519
+
+https://doi.org/10.26434/chemrxiv.14411081
+https://chemrxiv.org/articles/preprint/Prediction_and_Optimization_of_Ion_Transport_Characteristics_in_Nanoparticle-Based_Electrolytes_Using_Convolutional_Neural_Networks/14411081
+
+these all seem to be *multi-file* entities, thus not good for single file ingest pipeline.
+
+## direct.mit.edu
+
+ direct.mit.edu | | 996
+ direct.mit.edu | no-pdf-link | 869
+
+https://doi.org/10.7551/mitpress/14056.003.0004
+https://direct.mit.edu/books/monograph/5111/chapter-abstract/3060134/Adding-Technology-to-Contact-Tracing?redirectedFrom=fulltext
+
+"not available"
+
+https://doi.org/10.7551/mitpress/12444.003.0004
+
+"not available"
+
+
+## dlc.library.columbia.edu
+
+ dlc.library.columbia.edu | | 4225
+ dlc.library.columbia.edu | no-pdf-link | 2395
+ dlc.library.columbia.edu | spn2-wayback-error | 1568
+
+https://doi.org/10.7916/d8-506w-kk49
+https://dlc.library.columbia.edu/durst/cul:18931zcrk9
+
+document repository.
+this one goes to IA! actually many seem to.
+added extractor, should re-ingest with:
+
+ publisher:"Columbia University" doi_prefix:10.7916 !journal:*
+
+actually, that is like 600k+ results and many are not digitized, so perhaps not.
+
+## doi.ala.org.au
+
+ doi.ala.org.au | | 2570
+ doi.ala.org.au | no-pdf-link | 2153
+
+https://doi.org/10.26197/ala.811d55e3-2ff4-4501-b3e7-e19249507052
+https://doi.ala.org.au/doi/811d55e3-2ff4-4501-b3e7-e19249507052
+
+this is a data repository, with filesets, not papers. datacite metadata is
+incorrect.
+
+## fldeploc.dep.state.fl.us
+
+ fldeploc.dep.state.fl.us | | 774
+ fldeploc.dep.state.fl.us | no-pdf-link | 718
+
+
+https://doi.org/10.35256/ic29
+http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29
+
+re-ingest with:
+
+ # only ~800 works
+ doi_prefix:10.35256 publisher:Florida
+
+## geoscan.nrcan.gc.ca
+
+ geoscan.nrcan.gc.ca | | 2056
+ geoscan.nrcan.gc.ca | no-pdf-link | 2019
+
+https://doi.org/10.4095/295366
+https://geoscan.nrcan.gc.ca/starweb/geoscan/servlet.starweb?path=geoscan/fulle.web&search1=R=295366
+
+this is a geographic repository, not papers.
+
+## kiss.kstudy.com
+
+ kiss.kstudy.com | | 747
+ kiss.kstudy.com | no-pdf-link | 686
+
+https://doi.org/10.22143/hss21.12.1.121
+http://kiss.kstudy.com/thesis/thesis-view.asp?key=3862523
+
+Korean. seems to not actually be theses? can't download.
+
+## linkinghub.elsevier.com
+
+ linkinghub.elsevier.com | | 5079
+ linkinghub.elsevier.com | forbidden | 2226
+ linkinghub.elsevier.com | spn2-wayback-error | 1625
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758
+
+skipping for now, looks like mostly 'forbidden'?
+
+## osf.io
+
+These are important!
+
+ osf.io | | 3139
+ osf.io | not-found | 2288
+ osf.io | spn2-wayback-error | 582
+
+https://doi.org/10.31219/osf.io/jux3w
+https://accounts.osf.io/login?service=https://osf.io/jux3w/download
+
+many of these are 404s by browser as well. what does that mean?
+
+## peerj.com
+
+ peerj.com | | 785
+ peerj.com | no-pdf-link | 552
+
+https://doi.org/10.7287/peerj.11155v0.1/reviews/2
+https://peerj.com/articles/11155/reviews/
+
+these are HTML reviews, not papers
+
+## preprints.jmir.org
+
+ preprints.jmir.org | | 763
+ preprints.jmir.org | no-pdf-link | 611
+
+https://doi.org/10.2196/preprints.22556
+https://preprints.jmir.org/preprint/22556
+
+UGH, looks simple, but javascript.
+
+could try to re-write URL into S3 format? meh.
+
+## psyarxiv.com (OSF?)
+
+ psyarxiv.com | | 641
+ psyarxiv.com | no-pdf-link | 546
+
+https://doi.org/10.31234/osf.io/5jaqg
+https://psyarxiv.com/5jaqg/
+
+Also infuriatingly Javascript, but can do URL hack.
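+
+Presumably the same `/download` suffix that shows up in the osf.io login redirect above; a sketch of the rewrite, with the pattern assumed rather than verified:
+
+    # https://psyarxiv.com/5jaqg/ -> https://psyarxiv.com/5jaqg/download (assumed pattern)
+    echo "https://psyarxiv.com/5jaqg/" | sed -E 's|/?$|/download|'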
+
+Should reingest, and potentially force-recrawl:
+
+ # about 67k
+ publisher:"Center for Open Science" in_ia:false
+
+## publons.com
+
+ publons.com | | 6998
+ publons.com | no-pdf-link | 6982
+
+https://doi.org/10.1002/jmor.21338/v2/review1
+https://publons.com/publon/40260824/
+
+These are just HTML reviews, not papers.
+
+## saemobilus.sae.org
+
+ saemobilus.sae.org | | 795
+ saemobilus.sae.org | no-pdf-link | 669
+
+https://doi.org/10.4271/as1426c
+https://saemobilus.sae.org/content/as1426c
+
+These seem to be standards, and are not open access (paywall)
+
+## scholar.dkyobobook.co.kr
+
+ scholar.dkyobobook.co.kr | | 1043
+ scholar.dkyobobook.co.kr | no-pdf-link | 915
+
+https://doi.org/10.22471/crisis.2021.6.1.18
+http://scholar.dkyobobook.co.kr/searchDetail.laf?barcode=4010028199536
+
+Korean. complex javascript, skipping.
+
+## unreserved.rba.gov.au
+
+ unreserved.rba.gov.au | | 823
+ unreserved.rba.gov.au | no-pdf-link | 821
+
+https://doi.org/10.47688/rba_archives_2006/04129
+https://unreserved.rba.gov.au/users/login
+
+Don't need to login when I tried in browser? document repo, not papers.
+
+## wayf.switch.ch
+
+ wayf.switch.ch | | 1169
+ wayf.switch.ch | no-pdf-link | 809
+
+https://doi.org/10.24451/arbor.11128
+https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Farbor.bfh.ch%2Fshibboleth&return=https%3A%2F%2Farbor.bfh.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A5056fc0a97aeab16e5007ca63bede254cb5669d94173064d6c74c62a0f88b022
+
+Loginwall
+
+## www.bloomsburycollections.com
+
+ www.bloomsburycollections.com | | 1745
+ www.bloomsburycollections.com | no-pdf-link | 1571
+
+https://doi.org/10.5040/9781849664264.0008
+https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries
+
+These are primarily not OA/available.
+
+## www.emc2020.eu
+
+ www.emc2020.eu | | 791
+ www.emc2020.eu | no-pdf-link | 748
+
+https://doi.org/10.22443/rms.emc2020.146
+https://www.emc2020.eu/abstract/evaluation-of-different-rectangular-scan-strategies-for-hrstem-imaging.html
+
+These are just abstracts, not papers.
+
+## Emerald
+
+ www.emerald.com | | 2420
+ www.emerald.com | no-pdf-link | 1986
+
+https://doi.org/10.1108/ramj-11-2020-0065
+https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html
+
+Note that these URLs are already HTML fulltext, but the PDF is also available and easy to get.
+
+re-ingest:
+
+ # only ~3k or so missing
+ doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true
+
+## www.humankineticslibrary.com
+
+ www.humankineticslibrary.com | | 1122
+ www.humankineticslibrary.com | no-pdf-link | 985
+
+https://doi.org/10.5040/9781718206625.ch-002
+https://www.humankineticslibrary.com/encyclopedia-chapter?docid=b-9781718206625&tocid=b-9781718206625-chapter2
+
+paywall
+
+## www.inderscience.com
+
+ www.inderscience.com | | 1532
+ www.inderscience.com | no-pdf-link | 1217
+
+https://doi.org/10.1504/ijdmb.2020.10036342
+https://www.inderscience.com/info/ingeneral/forthcoming.php?jcode=ijdmb
+
+paywall
+
+## www.ingentaconnect.com
+
+ www.ingentaconnect.com | | 885
+ www.ingentaconnect.com | no-pdf-link | 783
+
+https://doi.org/10.15258/sst.2021.49.1.07
+https://www.ingentaconnect.com/content/ista/sst/pre-prints/content-7_sst.2021.49.1_63-71;jsessionid=1joc5mmi1juht.x-ic-live-02
+
+Annoying javascript, but easy to work around.
+
+re-ingest:
+
+ # only a couple hundred; also re-ingest
+ doi_prefix:10.15258 in_ia:false year:>2018
+
+## www.nomos-elibrary.de
+
+ www.nomos-elibrary.de | | 2235
+ www.nomos-elibrary.de | no-pdf-link | 1128
+ www.nomos-elibrary.de | spn2-wayback-error | 559
+
+https://doi.org/10.5771/9783748907084-439
+https://www.nomos-elibrary.de/10.5771/9783748907084-439/verzeichnis-der-autorinnen-und-autoren
+
+Javascript obfuscated download button?
+
+## www.oecd-ilibrary.org
+
+ www.oecd-ilibrary.org | | 3046
+ www.oecd-ilibrary.org | no-pdf-link | 2869
+
+https://doi.org/10.1787/543e84ed-en
+https://www.oecd-ilibrary.org/development/applying-evaluation-criteria-thoughtfully_543e84ed-en
+
+Paywall.
+
+## www.osapublishing.org
+
+ www.osapublishing.org | | 821
+ www.osapublishing.org | no-pdf-link | 615
+
+https://doi.org/10.1364/boe.422199
+https://www.osapublishing.org/boe/abstract.cfm?doi=10.1364/BOE.422199
+
+Some of these are "pre-registered" DOIs, not published yet. Many of the
+remaining are actually HTML articles, and/or have some stuff in the
+`citation_pdf_url`. A core problem is captchas.
+
+Have started adding support to fatcat for HTML crawl type based on container.
+
+re-ingest:
+
+ container_twtpsm6ytje3nhuqfu3pa7ca7u (optica)
+ container_cg4vcsfty5dfvgmat5wm62wgie (optics express)
+
+## www.oxfordscholarlyeditions.com
+
+ www.oxfordscholarlyeditions.com | | 759
+ www.oxfordscholarlyeditions.com | no-pdf-link | 719
+
+https://doi.org/10.1093/oseo/instance.00266789
+https://www.oxfordscholarlyeditions.com/view/10.1093/actrade/9780199593668.book.1/actrade-9780199593668-div1-27
+
+loginwall/paywall
+
+## www.schweizerbart.de
+
+ www.schweizerbart.de | | 730
+ www.schweizerbart.de | no-pdf-link | 653
+
+https://doi.org/10.1127/zfg/40/1996/461
+https://www.schweizerbart.de/papers/zfg/detail/40/97757/Theoretical_model_of_surface_karstic_processes?af=crossref
+
+paywall
+
+## www.sciencedirect.com
+
+ www.sciencedirect.com | | 14757
+ www.sciencedirect.com | no-pdf-link | 12733
+ www.sciencedirect.com | spn2-wayback-error | 1503
+
+https://doi.org/10.1016/j.landurbplan.2021.104104
+https://www.sciencedirect.com/science/article/pii/S0169204621000670
+
+Bunch of crazy new hacks, but seems to be working!
+
+re-ingest:
+
+ # to start! about 50k
+ doi_prefix:10.1016 is_oa:true year:2021
+
+## www.sciendo.com
+
+ www.sciendo.com | | 1955
+ www.sciendo.com | no-pdf-link | 1176
+
+https://doi.org/10.2478/awutm-2019-0012
+https://www.sciendo.com/article/10.2478/awutm-2019-0012
+
+uses lots of javascript, hard to scrape.
+
+
+## Others (for reference)
+
+ | | 725990
+ | no-pdf-link | 209933
+ | success | 206134
+ | spn2-wayback-error | 127015
+ | spn2-cdx-lookup-failure | 53384
+ | blocked-cookie | 35867
+ | link-loop | 25834
+ | too-many-redirects | 16430
+ | redirect-loop | 14648
+ | forbidden | 13794
+ | terminal-bad-status | 8055
+ | not-found | 6399
+ | remote-server-error | 2402
+ | wrong-mimetype | 2011
+ | spn2-error:unauthorized | 912
+ | bad-redirect | 555
+ | read-timeout | 530
+
+## Re-ingests
+
+All the above combined:
+
+ container_twtpsm6ytje3nhuqfu3pa7ca7u (optica)
+ container_cg4vcsfty5dfvgmat5wm62wgie (optics express)
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id twtpsm6ytje3nhuqfu3pa7ca7u
+ => Counter({'ingest_request': 1142, 'elasticsearch_release': 1142, 'estimate': 1142, 'kafka': 1142})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id cg4vcsfty5dfvgmat5wm62wgie
+ => Counter({'elasticsearch_release': 33482, 'estimate': 33482, 'ingest_request': 32864, 'kafka': 32864})
+
+ # only ~800 works
+ doi_prefix:10.35256 publisher:Florida
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query "doi_prefix:10.35256 publisher:Florida"
+ => Counter({'ingest_request': 843, 'elasticsearch_release': 843, 'estimate': 843, 'kafka': 843})
+
+ # only ~3k or so missing
+ doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1108 publisher:emerald"
+ => Counter({'ingest_request': 3812, 'elasticsearch_release': 3812, 'estimate': 3812, 'kafka': 3812})
+
+
+ # only a couple hundred; also re-ingest
+ doi_prefix:10.15258 in_ia:false year:>2018
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa --force-recrawl query "doi_prefix:10.15258 year:>2018"
+ => Counter({'ingest_request': 140, 'elasticsearch_release': 140, 'estimate': 140, 'kafka': 140})
+
+ # to start! about 50k
+ doi_prefix:10.1016 is_oa:true year:2020
+ doi_prefix:10.1016 is_oa:true year:2021
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2020"
+ => Counter({'ingest_request': 75936, 'elasticsearch_release': 75936, 'estimate': 75936, 'kafka': 75936})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2021"
+ => Counter({'ingest_request': 54824, 'elasticsearch_release': 54824, 'estimate': 54824, 'kafka': 54824})
+
+ pmcid:* year:2018
+ pmcid:* year:2019
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2018"
+ => Counter({'ingest_request': 25366, 'elasticsearch_release': 25366, 'estimate': 25366, 'kafka': 25366})
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2019"
+ => Counter({'ingest_request': 55658, 'elasticsearch_release': 55658, 'estimate': 55658, 'kafka': 55658})
+
diff --git a/notes/ingest/2021-07_unpaywall.md b/notes/ingest/2021-07_unpaywall.md
new file mode 100644
index 0000000..8b6ac09
--- /dev/null
+++ b/notes/ingest/2021-07_unpaywall.md
@@ -0,0 +1,320 @@
+
+New snapshot released 2021-07-02. Should be "boring" ingest and crawl.
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02T151134.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json
+ => 32.2M 3:01:52 [2.95k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 32196260, 'insert-requests': 3325954, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 32196260, 'pushed': 32196260})
+
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json';
+ => COPY 3556146
+
+ # previous, 2020-10 run: COPY 4216339
+    # previous, 2021-04 run: COPY 3277484
+
+Oops, should have run this instead, with the date filter:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json';
+
+But didn't, so processed all instead.
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json
+ => 3.56M 0:01:59 [29.8k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => done, on 2021-07-13
+
+
+## Check Pre-Crawl Status
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 1831827
+ success | 1343604
+ redirect-loop | 103999
+ terminal-bad-status | 19845
+ no-pdf-link | 17448
+ link-loop | 5027
+ wrong-mimetype | 2270
+ cdx-error | 523
+ body-too-large | 321
+ null-body | 298
+ wayback-content-error | 242
+ petabox-error | 155
+ gateway-timeout | 138
+ invalid-host-resolution | 120
+ wayback-error | 109
+ blocked-cookie | 9
+ timeout | 7
+ | 3
+ bad-redirect | 3
+ spn2-cdx-lookup-failure | 3
+ (20 rows)
+
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json';
+ => COPY 1743186
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json
+ => 1.74M 0:01:33 [18.6k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.*.txt
+ 1 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt
+ 1643963 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt
+ 1644028 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt
+ 3287992 total
+
+Then run crawl (see `journal-crawls` git repo).
+
+## Post-Crawl Bulk Ingest
+
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => 1.74M 0:01:59 [14.6k/s]
+
+## Post-Ingest Stats
+
+Only the recent updates:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 2690258
+ redirect-loop | 227328
+ no-capture | 157368
+ terminal-bad-status | 118943
+ no-pdf-link | 92698
+ blocked-cookie | 19478
+ link-loop | 9249
+ wrong-mimetype | 4918
+ cdx-error | 1786
+ wayback-error | 1497
+ null-body | 1302
+ body-too-large | 433
+ wayback-content-error | 245
+ petabox-error | 171
+ gateway-timeout | 138
+ invalid-host-resolution | 120
+ timeout | 12
+ bad-redirect | 4
+ | 3
+ spn2-cdx-lookup-failure | 1
+ (20 rows)
+
+Only the recent updates, by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+ release_stage | status | count
+ ---------------+-------------------------+---------
+ accepted | success | 103144
+ accepted | no-pdf-link | 53981
+ accepted | terminal-bad-status | 4102
+ accepted | link-loop | 2799
+ accepted | no-capture | 2315
+ accepted | redirect-loop | 2171
+ accepted | blocked-cookie | 234
+ accepted | cdx-error | 140
+ accepted | wayback-error | 101
+ accepted | wrong-mimetype | 38
+ accepted | null-body | 10
+ accepted | petabox-error | 5
+ accepted | wayback-content-error | 4
+ accepted | gateway-timeout | 2
+ accepted | body-too-large | 2
+ published | success | 1919100
+ published | no-capture | 130104
+ published | redirect-loop | 127482
+ published | terminal-bad-status | 43118
+ published | no-pdf-link | 33505
+ published | blocked-cookie | 19034
+ published | link-loop | 6241
+ published | wrong-mimetype | 4163
+ published | null-body | 1195
+ published | cdx-error | 1151
+ published | wayback-error | 1105
+ published | wayback-content-error | 197
+ published | body-too-large | 195
+ published | petabox-error | 118
+ published | gateway-timeout | 35
+ published | invalid-host-resolution | 13
+ published | timeout | 8
+ published | bad-redirect | 2
+ published | spn2-cdx-lookup-failure | 1
+ published | bad-gzip-encoding | 1
+ submitted | success | 668014
+ submitted | redirect-loop | 97675
+ submitted | terminal-bad-status | 71723
+ submitted | no-capture | 24949
+ submitted | no-pdf-link | 5212
+ submitted | wrong-mimetype | 717
+ submitted | cdx-error | 495
+ submitted | wayback-error | 291
+ submitted | body-too-large | 236
+ submitted | blocked-cookie | 210
+ submitted | link-loop | 209
+ submitted | invalid-host-resolution | 107
+ submitted | gateway-timeout | 101
+ submitted | null-body | 97
+ submitted | petabox-error | 48
+ submitted | wayback-content-error | 44
+ submitted | timeout | 4
+ submitted | | 3
+ submitted | bad-redirect | 2
+ submitted | remote-server-error | 1
+ (55 rows)
+
+In total, this iteration of unpaywall ingest resulted in:
+
+- 3,325,954 raw ingest requests (new URLs total)
+- 1,743,186 (52% of all) of these had not been seen/crawled from any source yet (?), and were attempted in the crawl
+- 1,346,654 (77% of crawled) success from new heritrix crawling
+- 2,690,258 (80%) total success (including URLs crawled initially for other reasons; out of all new URLs, including those not expected to succeed)
+
+## Live Ingest Follow-Up
+
+Will run SPN requests on the ~160k `no-capture` URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-07-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json';
+ => COPY 157371
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json
+ => 157k 0:00:04 [31.6k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ => DONE
diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md
new file mode 100644
index 0000000..5f92196
--- /dev/null
+++ b/notes/ingest/2021-08_mag.md
@@ -0,0 +1,400 @@
+
+Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest.
+Also want to re-ingest some old/failed ingests, now that pipeline/code has
+improved.
+
+Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs.
+
+
+## Persist Ingest Requests
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000})
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 22.5M 0:46:00 [8.16k/s]
+ => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585})
+
+Roughly 8.6 million new URLs
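+
+A quick sanity check that the new rows actually landed (database name and connection details here are assumptions):
+
+    psql sandcrawler -c "SELECT COUNT(*) FROM ingest_request WHERE link_source = 'mag' AND created > '2021-06-01';"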
+
+## Pre-Crawl Status Counts
+
+Status of combined old and new requests, with some large domains removed:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------+----------
+ success | 26123975
+ | 6664846
+ no-pdf-link | 1859908
+ redirect-loop | 1532405
+ no-capture | 1199126
+ link-loop | 1157010
+ terminal-bad-status | 832362
+ gateway-timeout | 202158
+ spn2-cdx-lookup-failure | 81406
+ wrong-mimetype | 69087
+ invalid-host-resolution | 37262
+ wayback-error | 21340
+ petabox-error | 11237
+ null-body | 9414
+ wayback-content-error | 2199
+ cdx-error | 1893
+ spn2-error | 1741
+ spn2-error:job-failed | 971
+ blocked-cookie | 902
+ spn2-error:invalid-url-syntax | 336
+ (20 rows)
+
+And just the new URLs (note that domain filter shouldn't be required, but
+keeping for consistency):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ | 6664780
+ success | 1957844
+ redirect-loop | 23357
+ terminal-bad-status | 9385
+ no-pdf-link | 8315
+ no-capture | 6892
+ link-loop | 4517
+ wrong-mimetype | 3864
+ cdx-error | 1749
+ blocked-cookie | 842
+ null-body | 747
+ wayback-error | 688
+ wayback-content-error | 570
+ gateway-timeout | 367
+ petabox-error | 340
+ spn2-cdx-lookup-failure | 150
+ read-timeout | 122
+ not-found | 119
+ invalid-host-resolution | 63
+ spn2-error | 23
+ (20 rows)
+
+## Dump Initial Bulk Ingest Requests
+
+Note that this is all-time, not just recent, and will re-process a lot of
+"no-pdf-link":
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-pdf-link'
+ OR ingest_file_result.status = 'cdx-error'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json';
+ => COPY 8526647
+
+Transform to ingest requests:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json
+ => 8.53M 0:03:40
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+Updated stats after running initial bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 5184994
+ no-capture | 3284416
+ redirect-loop | 98685
+ terminal-bad-status | 28733
+ link-loop | 28518
+ blocked-cookie | 22338
+ no-pdf-link | 19073
+ wrong-mimetype | 9122
+ null-body | 2793
+ wayback-error | 2128
+ wayback-content-error | 1233
+ cdx-error | 1198
+ petabox-error | 617
+ gateway-timeout | 395
+ not-found | 130
+ read-timeout | 128
+ | 111
+ invalid-host-resolution | 63
+ spn2-cdx-lookup-failure | 24
+ spn2-error | 20
+ (20 rows)
+
+## Generate Seedlist
+
+For crawling, do a similar (but not identical) dump:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json';
+ => COPY 4599519
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json
+ => 4.60M 0:02:55 [26.2k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+ cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ => DONE
+
+ wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt
+ 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+
+## Post-Crawl Bulk Re-Ingest
+
+Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by
+hash, URL agnostic).
+
+Enqueue for bulk re-ingest:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => Thu 19 Aug 2021 09:10:59 PM UTC
+
+
+## Post-Ingest Stats
+
+Just the new stuff (compare against above for delta):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 7748241 89.2%
+ no-capture | 429688 4.9%
+ redirect-loop | 172831 2.0%
+ terminal-bad-status | 94029 1.1%
+ no-pdf-link | 86437 1.0%
+ blocked-cookie | 67903 0.8%
+ link-loop | 50622
+ wrong-mimetype | 21064
+ null-body | 6650
+ cdx-error | 3313
+ wayback-error | 2630
+ gateway-timeout | 399
+ petabox-error | 268
+ wayback-content-error | 170
+ not-found | 130
+ read-timeout | 128
+ | 109
+ invalid-host-resolution | 63
+ bad-redirect | 39
+ spn2-error | 20
+ (20 rows)
+
+New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397
+
+Overall success of new batch: 7748241 / 8686315 = 89.2%
+
+And combined (old and new) status again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 31990062
+ redirect-loop | 1704717
+ no-capture | 1263462
+ link-loop | 1218280
+ blocked-cookie | 1213838
+ no-pdf-link | 1096664
+ terminal-bad-status | 960070
+ gateway-timeout | 202190
+ wrong-mimetype | 86557
+ invalid-host-resolution | 37262
+ null-body | 15443
+ wayback-error | 12839
+ cdx-error | 4047
+ spn2-error | 1731
+ spn2-error:job-failed | 962
+ petabox-error | 463
+ wayback-content-error | 379
+ spn2-error:invalid-url-syntax | 336
+ spn2-error:soft-time-limit-exceeded | 203
+ | 175
+ (20 rows)
+
+New success total: 31990062 - 26123975 = 5,866,087
+
+A full 1,263,462 no-capture results could still be attempted... though many of those may
+be excluded for a specific reason.
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md
new file mode 100644
index 0000000..ac808dd
--- /dev/null
+++ b/notes/ingest/2021-09-02_oai_pmh_patch.md
@@ -0,0 +1,1578 @@
+
+Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially
+re-crawling content which failed to ingest the first time.
+
+May fold this in with more general patch crawling.
+
+## Basic Counts
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 14145387
+ no-pdf-link | 12063022
+ no-capture | 5485640
+ redirect-loop | 2092705
+ terminal-bad-status | 747372
+ wrong-mimetype | 597219
+ link-loop | 542144
+ null-body | 93566
+ cdx-error | 19798
+ petabox-error | 17943
+ | 15283
+ wayback-error | 13897
+ gateway-timeout | 511
+ skip-url-blocklist | 184
+ wayback-content-error | 146
+ bad-redirect | 137
+ redirects-exceeded | 120
+ bad-gzip-encoding | 116
+ timeout | 80
+ blocked-cookie | 64
+ (20 rows)
+
+ SELECT
+ oai_prefix,
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
+ COUNT(*) as total
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix
+ ORDER BY total DESC
+ LIMIT 40;
+
+
+ oai_prefix | success | total
+ ---------------------------+---------+---------
+ repec | 1133175 | 2783448
+ hal | 573218 | 1049607
+ www.irgrid.ac.cn | 18007 | 748828
+ cds.cern.ch | 74078 | 688091
+ americanae.aecid.es | 71310 | 572792
+ juser.fz-juelich.de | 23026 | 518551
+ espace.library.uq.edu.au | 6649 | 508960
+ igi.indrastra.com | 59629 | 478577
+ archive.ugent.be | 65306 | 424014
+ hrcak.srce.hr | 404085 | 414897
+ zir.nsk.hr | 156753 | 397200
+ renati.sunedu.gob.pe | 79362 | 388355
+ hypotheses.org | 3 | 374296
+ rour.neicon.ru | 7997 | 354529
+ generic.eprints.org | 263566 | 340470
+ invenio.nusl.cz | 6340 | 325867
+ evastar-karlsruhe.de | 62282 | 317952
+ quod.lib.umich.edu | 5 | 309135
+ diva.org | 67917 | 298348
+ t2r2.star.titech.ac.jp | 1085 | 289388
+ edpsciences.org | 139495 | 284972
+ repository.ust.hk | 10245 | 283417
+ revues.org | 151156 | 277497
+ pure.atira.dk | 13492 | 260754
+ bibliotecadigital.jcyl.es | 50606 | 254134
+ escholarship.org/ark | 140835 | 245203
+ ojs.pkp.sfu.ca | 168029 | 229387
+ lup.lub.lu.se | 49358 | 226602
+ library.wur.nl | 15051 | 216738
+ digitalrepository.unm.edu | 111704 | 211749
+ infoscience.tind.io | 60166 | 207299
+ edoc.mpg.de | 0 | 205252
+ erudit.org | 168490 | 197803
+ delibra.bg.polsl.pl | 38666 | 196652
+ n/a | 0 | 193814
+ aleph.bib-bvb.de | 4349 | 186666
+ serval.unil.ch | 41643 | 186372
+ orbi.ulg.ac.be | 2400 | 184551
+ digitalcommons.unl.edu | 144025 | 184372
+ bib-pubdb1.desy.de | 33525 | 182717
+ (40 rows)
+
+Top counts by OAI prefix and status:
+
+ SELECT
+ oai_prefix,
+ status,
+ COUNT((oai_prefix,status))
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix, status
+ ORDER BY COUNT DESC
+ LIMIT 50;
+
+ oai_prefix | status | count
+ ---------------------------+---------------+---------
+ repec | success | 1133175
+ repec | no-pdf-link | 638105
+ hal | success | 573218
+ cds.cern.ch | no-capture | 540380
+ repec | redirect-loop | 516451
+ juser.fz-juelich.de | no-pdf-link | 477881
+ americanae.aecid.es | no-pdf-link | 417766
+ hrcak.srce.hr | success | 404085
+ www.irgrid.ac.cn | no-pdf-link | 370908
+ hal | no-pdf-link | 359252
+ www.irgrid.ac.cn | no-capture | 355532
+ espace.library.uq.edu.au | no-pdf-link | 320479
+ igi.indrastra.com | no-pdf-link | 318242
+ repec | no-capture | 316981
+ invenio.nusl.cz | no-pdf-link | 309802
+ rour.neicon.ru | redirect-loop | 300911
+ hypotheses.org | no-pdf-link | 300251
+ renati.sunedu.gob.pe | no-capture | 282800
+ t2r2.star.titech.ac.jp | no-pdf-link | 272045
+ generic.eprints.org | success | 263566
+ quod.lib.umich.edu | no-pdf-link | 259661
+ archive.ugent.be | no-capture | 256127
+ evastar-karlsruhe.de | no-pdf-link | 248939
+ zir.nsk.hr | link-loop | 226919
+ repository.ust.hk | no-pdf-link | 208569
+ edoc.mpg.de | no-pdf-link | 199758
+ bibliotecadigital.jcyl.es | no-pdf-link | 188433
+ orbi.ulg.ac.be | no-pdf-link | 172373
+ diva.org | no-capture | 171115
+ lup.lub.lu.se | no-pdf-link | 168652
+ erudit.org | success | 168490
+ ojs.pkp.sfu.ca | success | 168029
+ lib.dr.iastate.edu | success | 158494
+ zir.nsk.hr | success | 156753
+ digital.kenyon.edu | success | 154900
+ revues.org | success | 151156
+ books.openedition.org | no-pdf-link | 149607
+ freidok.uni-freiburg.de | no-pdf-link | 146837
+ digitalcommons.unl.edu | success | 144025
+ escholarship.org/ark | success | 140835
+ culeuclid | link-loop | 140291
+ edpsciences.org | success | 139495
+ serval.unil.ch | no-pdf-link | 138644
+ bib-pubdb1.desy.de | no-pdf-link | 133815
+ krm.or.kr | no-pdf-link | 132461
+ pure.atira.dk | no-pdf-link | 132179
+ oai-gms.dimdi.de | redirect-loop | 131409
+ aleph.bib-bvb.de | no-capture | 128261
+ library.wur.nl | no-pdf-link | 124718
+ lirias2repo.kuleuven.be | no-capture | 123106
+ (50 rows)
+
+Note: could just delete the "excluded" rows, stop harvesting them in the future, and
+filter them out at ingest time (in the transform script).
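+
+For reference, an unexecuted sketch of what deleting those excluded request rows
+could look like (matching ingest_file_result rows would presumably need the same
+treatment):
+
+    DELETE FROM ingest_request
+    WHERE
+        ingest_type = 'pdf'
+        AND link_source = 'oai'
+        AND (
+            link_source_id LIKE 'oai:kb.dk:%'
+            OR link_source_id LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+            OR link_source_id LIKE 'oai:hispana.mcu.es:%'
+            OR link_source_id LIKE 'oai:bnf.fr:%'
+            OR link_source_id LIKE 'oai:ukm.si:%'
+            OR link_source_id LIKE 'oai:biodiversitylibrary.org:%'
+            OR link_source_id LIKE 'oai:hsp.org:%'
+            OR base_url LIKE '%www.kb.dk%'
+            OR base_url LIKE '%kb-images.kb.dk%'
+            OR base_url LIKE '%mdz-nbn-resolving.de%'
+            OR base_url LIKE '%aggr.ukm.um.si%'
+        );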
+
+
+
+## Investigate no-pdf-link sandcrawler improvements
+
+Do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works:
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.link_source_id LIKE 'oai:library.wur.nl:%'
+ ORDER BY random()
+ LIMIT 10;
+
+Random sampling of *all* 'no-pdf-link' URLs (see if newer sandcrawler works):
+
+ \x auto
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ ORDER BY random()
+ LIMIT 30;
+
+### repec (SKIP-PREFIX)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351
+base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115
+base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92
+base_url | http://pz.wz.uw.edu.pl/en
+terminal_url | http://pz.wz.uw.edu.pl:80/en
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:erv:rccsrc:y:2016:i:2016_11:35
+base_url | http://www.eumed.net/rev/caribe/2016/11/estructura.html
+terminal_url | http://www.eumed.net:80/rev/caribe/2016/11/estructura.html
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:pio:envira:v:33:y:2001:i:4:p:629-647
+base_url | http://www.envplan.com/epa/fulltext/a33/a3319.pdf
+terminal_url | http://uk.sagepub.com:80/en-gb/eur/pion-journals-published
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:tpr:qjecon:v:100:y:1985:i:3:p:651-75
+base_url | http://links.jstor.org/sici?sici=0033-5533%28198508%29100%3A3%3C651%3ATCOCEA%3E2.0.CO%3B2-2&origin=repec
+terminal_url | https://www.jstor.org/stable/1884373
+
+Huh! This is just a catalog pointing to other domains. Should probably skip it.
+
+DONE: skip/filter repec
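+
+The exclusion would presumably mirror the other prefix filters, e.g.:
+
+    AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'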
+
+### juser.fz-juelich.de (SCOPE)
+
+-[ RECORD 1 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:132217
+base_url | http://juser.fz-juelich.de/record/132217
+terminal_url | http://juser.fz-juelich.de/record/132217
+
+Poster; no files.
+
+-[ RECORD 2 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:268598
+base_url | http://juser.fz-juelich.de/record/268598
+terminal_url | http://juser.fz-juelich.de/record/268598
+
+Journal.
+
+-[ RECORD 3 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:126613
+base_url | http://juser.fz-juelich.de/record/126613
+terminal_url | http://juser.fz-juelich.de/record/126613
+
+-[ RECORD 4 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:67362
+base_url | http://juser.fz-juelich.de/record/67362
+terminal_url | http://juser.fz-juelich.de/record/67362
+-[ RECORD 5 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:869189
+base_url | http://juser.fz-juelich.de/record/869189
+terminal_url | http://juser.fz-juelich.de/record/869189
+-[ RECORD 6 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:810746
+base_url | http://juser.fz-juelich.de/record/810746
+terminal_url | http://juser.fz-juelich.de/record/810746
+-[ RECORD 7 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:52897
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+-[ RECORD 8 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:114755
+base_url | http://juser.fz-juelich.de/record/114755
+terminal_url | http://juser.fz-juelich.de/record/114755
+-[ RECORD 9 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:58025
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+
+The search URLs seem redundant? Not going to try to handle those.
+
+"Powered by Invenio v1.1.7"
+
+None of these examples seem to be papers. Maybe we can filter these better at the
+harvest or transform stage?
+
+### americanae.aecid.es (MIXED)
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:502896
+base_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+terminal_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+
+Just a metadata record? Links to Redalyc.
+
+METADATA-ONLY
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:534600
+base_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+terminal_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:524567
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+
+NOT-FOUND (404)
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:378914
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+
+Some single-page image archival thing? Bespoke; skipping.
+
+SKIP-BESPOKE
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:526142
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+
+NOT-FOUND (404)
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:373408
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+
+NOT-FOUND (404)
+
+### www.irgrid.ac.cn (SKIP-PREFIX)
+
+Chinese Academy of Sciences Institutional Repositories Grid
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1749980
+base_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+
+Can't access
+
+FORBIDDEN
+
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/857397
+base_url | http://www.irgrid.ac.cn/handle/1471x/857397
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/857397
+
+Just linking to another IR; skip it.
+
+http://ir.ipe.ac.cn/handle/122111/10608
+
+requires login
+
+DONE: '/password-login;jsessionid' as a loginwall URL pattern
+ http://ir.ipe.ac.cn/handle/122111/10608
+ http://ir.ipe.ac.cn/bitstream/122111/10608/2/%e9%92%9d%e9%a1%b6%e8%9e%ba%e6%97%8b%e8%97%bb%e5%9c%a8%e4%b8%8d%e5%90%8c%e5%85%89%e7%85%a7%e6%9d%a1%e4%bb%b6%e4%b8%8b%e7%9a%84%e6%94%be%e6%b0%a7%e7%89%b9%e6%80%a7_%e8%96%9b%e5%8d%87%e9%95%bf.pdf
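+
+A quick (untested) check for how widespread that loginwall pattern is:
+
+    SELECT COUNT(*)
+    FROM ingest_file_result
+    WHERE terminal_url LIKE '%/password-login;jsessionid%';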
+
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1060447
+base_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1671377
+base_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+terminal_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1178430
+base_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2488017
+base_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/977147
+base_url | http://www.irgrid.ac.cn/handle/1471x/977147
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/977147
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2454503
+base_url | http://ir.nwipb.ac.cn/handle/363003/9957
+terminal_url | http://ir.nwipb.ac.cn/handle/363003/9957
+
+This domain is a disappointment :(
+
+Should continue crawling, as the metadata is open and good, but we won't get fulltext?
+
+### hal (FIXED-PARTIAL)
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00744951v1
+base_url | https://hal.archives-ouvertes.fr/hal-00744951
+terminal_url | https://hal.archives-ouvertes.fr/hal-00744951
+
+Off-site OA link.
+
+FIXED-HAL
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-01065398v1
+base_url | https://hal.archives-ouvertes.fr/hal-01065398/file/AbstractSGE14_B_assaad.pdf
+terminal_url | https://hal.archives-ouvertes.fr/index/index
+-[ RECORD 3 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:lirmm-00371599v1
+base_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+terminal_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+
+To Elsevier :(
+
+-[ RECORD 4 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00284780v1
+base_url | https://hal.archives-ouvertes.fr/hal-00284780
+terminal_url | https://hal.archives-ouvertes.fr/hal-00284780
+
+METADATA-ONLY
+
+-[ RECORD 5 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00186151v1
+base_url | https://hal.archives-ouvertes.fr/hal-00186151
+terminal_url | https://hal.archives-ouvertes.fr/hal-00186151
+
+METADATA-ONLY
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00399754v1
+base_url | https://hal.archives-ouvertes.fr/hal-00399754
+terminal_url | https://hal.archives-ouvertes.fr/hal-00399754
+
+METADATA-ONLY
+
+
+### espace.library.uq.edu.au (SKIP)
+
+-[ RECORD 1 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:136497
+base_url | https://espace.library.uq.edu.au/view/UQ:136497
+terminal_url | https://espace.library.uq.edu.au/view/UQ:136497
+-[ RECORD 2 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:411389
+base_url | https://espace.library.uq.edu.au/view/UQ:411389
+terminal_url | https://espace.library.uq.edu.au/view/UQ:411389
+-[ RECORD 3 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:401773
+base_url | https://espace.library.uq.edu.au/view/UQ:401773
+terminal_url | https://espace.library.uq.edu.au/view/UQ:401773
+-[ RECORD 4 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:675334
+base_url | https://espace.library.uq.edu.au/view/UQ:675334
+terminal_url | https://espace.library.uq.edu.au/view/UQ:675334
+-[ RECORD 5 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:312311
+base_url | https://espace.library.uq.edu.au/view/UQ:312311
+terminal_url | https://espace.library.uq.edu.au/view/UQ:312311
+-[ RECORD 6 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:209401
+base_url | https://espace.library.uq.edu.au/view/UQ:209401
+terminal_url | https://espace.library.uq.edu.au/view/UQ:209401
+-[ RECORD 7 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:327188
+base_url | https://espace.library.uq.edu.au/view/UQ:327188
+terminal_url | https://espace.library.uq.edu.au/view/UQ:327188
+
+Very JavaScript-heavy (skeletal HTML), and the records just link to fulltext on
+publisher sites.
+
+### igi.indrastra.com (METADATA-ONLY)
+
+-[ RECORD 1 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:267221
+base_url | http://igi.indrastra.com/items/show/267221
+terminal_url | http://igi.indrastra.com/items/show/267221
+-[ RECORD 2 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:181799
+base_url | http://igi.indrastra.com/items/show/181799
+terminal_url | http://igi.indrastra.com/items/show/181799
+-[ RECORD 3 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:125382
+base_url | http://igi.indrastra.com/items/show/125382
+terminal_url | http://igi.indrastra.com/items/show/125382
+-[ RECORD 4 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:47266
+base_url | http://igi.indrastra.com/items/show/47266
+terminal_url | http://igi.indrastra.com/items/show/47266
+-[ RECORD 5 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:12872
+base_url | http://igi.indrastra.com/items/show/12872
+terminal_url | http://igi.indrastra.com/items/show/12872
+-[ RECORD 6 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:231620
+base_url | http://igi.indrastra.com/items/show/231620
+terminal_url | http://igi.indrastra.com/items/show/231620
+
+"Proudly powered by Omeka"
+
+### invenio.nusl.cz (METADATA-ONLY)
+
+ oai_id | base_url | terminal_url
+----------------------------+------------------------------------+--------------------------------------
+ oai:invenio.nusl.cz:237409 | http://www.nusl.cz/ntk/nusl-237409 | http://invenio.nusl.cz/record/237409
+ oai:invenio.nusl.cz:180783 | http://www.nusl.cz/ntk/nusl-180783 | http://invenio.nusl.cz/record/180783
+ oai:invenio.nusl.cz:231961 | http://www.nusl.cz/ntk/nusl-231961 | http://invenio.nusl.cz/record/231961
+ oai:invenio.nusl.cz:318800 | http://www.nusl.cz/ntk/nusl-318800 | http://invenio.nusl.cz/record/318800
+ oai:invenio.nusl.cz:259695 | http://www.nusl.cz/ntk/nusl-259695 | http://invenio.nusl.cz/record/259695
+ oai:invenio.nusl.cz:167393 | http://www.nusl.cz/ntk/nusl-167393 | http://invenio.nusl.cz/record/167393
+ oai:invenio.nusl.cz:292987 | http://www.nusl.cz/ntk/nusl-292987 | http://invenio.nusl.cz/record/292987
+ oai:invenio.nusl.cz:283396 | http://www.nusl.cz/ntk/nusl-283396 | http://invenio.nusl.cz/record/283396
+ oai:invenio.nusl.cz:241512 | http://www.nusl.cz/ntk/nusl-241512 | http://invenio.nusl.cz/record/241512
+ oai:invenio.nusl.cz:178631 | http://www.nusl.cz/ntk/nusl-178631 | http://invenio.nusl.cz/record/178631
+
+Metadata only (at least this set)
+
+### hypotheses.org
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:mittelalter/9529
+base_url | http://mittelalter.hypotheses.org/9529
+terminal_url | https://mittelalter.hypotheses.org/9529
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/18638
+base_url | http://archivalia.hypotheses.org/18638
+terminal_url | https://archivalia.hypotheses.org/18638
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/13614
+base_url | http://archivalia.hypotheses.org/13614
+terminal_url | https://archivalia.hypotheses.org/13614
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:teteschercheuses/2785
+base_url | http://teteschercheuses.hypotheses.org/2785
+terminal_url | https://teteschercheuses.hypotheses.org/2785
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:altervsego/608
+base_url | http://altervsego.hypotheses.org/608
+terminal_url | http://altervsego.hypotheses.org/608
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivewk1/21905
+base_url | http://archivewk1.hypotheses.org/21905
+terminal_url | https://archivewk1.hypotheses.org/21905
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:slkdiaspo/3321
+base_url | http://slkdiaspo.hypotheses.org/3321
+terminal_url | https://slkdiaspo.hypotheses.org/3321
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:diga/280
+base_url | http://diga.hypotheses.org/280
+terminal_url | https://diga.hypotheses.org/280
+
+These are all a big mix... basically blogs. Should continue crawling, but expect no yield.
+
+### t2r2.star.titech.ac.jp (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00105099
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00101346
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50161100
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00232407
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50120040
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50321440
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50235666
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+
+
+### quod.lib.umich.edu
+
+-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0015.003-2
+base_url | http://name.umdl.umich.edu/acf2679.0015.003
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0015.003
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:b14970.0001.001
+base_url | http://name.umdl.umich.edu/B14970.0001.001
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=eebo2;idno=B14970.0001.001
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0009.010-3
+base_url | http://name.umdl.umich.edu/ACF2679-1623SOUT-209
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0009.010;node=acf2679.0009.010:3
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-16.006-43
+base_url | http://name.umdl.umich.edu/acg2248.1-16.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-16.006
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-14.011-9
+base_url | http://name.umdl.umich.edu/ACG2248-1489LADI-364
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-14.011;node=acg2248.1-14.011:9
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg1336.1-24.006-9
+base_url | http://name.umdl.umich.edu/acg1336.1-24.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg1336.1-24.006
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:africanamer.0002.32a
+base_url | http://name.umdl.umich.edu/africanamer.0002.32a
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=africanamer;idno=africanamer.0002.32a
+
+These are... issues of journals? Should continue to crawl, but not expect much.
+
+### evastar-karlsruhe.de (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:270011444
+base_url | https://publikationen.bibliothek.kit.edu/270011444
+terminal_url | https://publikationen.bibliothek.kit.edu/270011444
+-[ RECORD 2 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000050117
+base_url | https://publikationen.bibliothek.kit.edu/1000050117
+terminal_url | https://publikationen.bibliothek.kit.edu/1000050117
+-[ RECORD 3 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:362296
+base_url | https://publikationen.bibliothek.kit.edu/362296
+terminal_url | https://publikationen.bibliothek.kit.edu/362296
+-[ RECORD 4 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:23042000
+base_url | https://publikationen.bibliothek.kit.edu/23042000
+terminal_url | https://publikationen.bibliothek.kit.edu/23042000
+-[ RECORD 5 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000069945
+base_url | https://publikationen.bibliothek.kit.edu/1000069945
+terminal_url | https://publikationen.bibliothek.kit.edu/1000069945
+
+
+### repository.ust.hk
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-67233
+base_url | http://repository.ust.hk/ir/Record/1783.1-67233
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-67233
+-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-63232
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=A1981KV47900017
+terminal_url | http://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253AA1981KV47900017%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=http%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-2891
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=000240035400103
+terminal_url | https://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253A000240035400103%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=https%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 4 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-56231
+base_url | http://repository.ust.hk/ir/Record/1783.1-56231
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-56231
+
+[...]
+
+-[ RECORD 6 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-24872
+base_url | http://repository.ust.hk/ir/Record/1783.1-24872
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-24872
+-[ RECORD 7 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-3457
+base_url | http://lbdiscover.ust.hk/uresolver?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+terminal_url | http://lbdiscover.ust.hk/uresolver/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+-[ RECORD 8 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-73215
+base_url | http://repository.ust.hk/ir/Record/1783.1-73215
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-73215
+
+DONE: gateway.isiknowledge.com is bogus/blocking?
+
+
+### edoc.mpg.de (SKIP-DEPRECATED)
+
+ oai_id | base_url | terminal_url
+------------------------+---------------------------+---------------------------
+ oai:edoc.mpg.de:416650 | http://edoc.mpg.de/416650 | http://edoc.mpg.de/416650
+ oai:edoc.mpg.de:8195 | http://edoc.mpg.de/8195 | http://edoc.mpg.de/8195
+ oai:edoc.mpg.de:379655 | http://edoc.mpg.de/379655 | http://edoc.mpg.de/379655
+ oai:edoc.mpg.de:641179 | http://edoc.mpg.de/641179 | http://edoc.mpg.de/641179
+ oai:edoc.mpg.de:607141 | http://edoc.mpg.de/607141 | http://edoc.mpg.de/607141
+ oai:edoc.mpg.de:544412 | http://edoc.mpg.de/544412 | http://edoc.mpg.de/544412
+ oai:edoc.mpg.de:314531 | http://edoc.mpg.de/314531 | http://edoc.mpg.de/314531
+ oai:edoc.mpg.de:405047 | http://edoc.mpg.de/405047 | http://edoc.mpg.de/405047
+ oai:edoc.mpg.de:239650 | http://edoc.mpg.de/239650 | http://edoc.mpg.de/239650
+ oai:edoc.mpg.de:614852 | http://edoc.mpg.de/614852 | http://edoc.mpg.de/614852
+
+This whole instance seems to have been replaced.
+
+### bibliotecadigital.jcyl.es (SKIP-DIGITIZED)
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000039962
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+-[ RECORD 2 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14075
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+-[ RECORD 3 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:4842
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+-[ RECORD 4 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14799
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+-[ RECORD 5 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:821
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+
+Digitized images as pages; too much to deal with for now.
+
+### orbi.ulg.ac.be
+
+-[ RECORD 1 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/128079
+base_url | https://orbi.uliege.be/handle/2268/128079
+terminal_url | https://orbi.uliege.be/handle/2268/128079
+-[ RECORD 2 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/67659
+base_url | https://orbi.uliege.be/handle/2268/67659
+terminal_url | https://orbi.uliege.be/handle/2268/67659
+-[ RECORD 3 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/35521
+base_url | https://orbi.uliege.be/handle/2268/35521
+terminal_url | https://orbi.uliege.be/handle/2268/35521
+-[ RECORD 4 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/107922
+base_url | https://orbi.uliege.be/handle/2268/107922
+terminal_url | https://orbi.uliege.be/handle/2268/107922
+-[ RECORD 5 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/215694
+base_url | https://orbi.uliege.be/handle/2268/215694
+terminal_url | https://orbi.uliege.be/handle/2268/215694
+
+Described below.
+
+### library.wur.nl (FIXED-BESPOKE)
+
+ oai_id | base_url | terminal_url
+ -----------------------------------+------------------------------------------------+------------------------------------------------
+ oai:library.wur.nl:wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939
+ oai:library.wur.nl:wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707
+ oai:library.wur.nl:wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208
+ oai:library.wur.nl:wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378
+ oai:library.wur.nl:wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416
+ oai:library.wur.nl:wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930
+ oai:library.wur.nl:wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076
+ oai:library.wur.nl:wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109
+ oai:library.wur.nl:wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146
+ oai:library.wur.nl:wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922
+ (10 rows)
+
+Seems like a one-off site? But added a pattern.
+
+### pure.atira.dk
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/a27762fd-0919-4753-af55-00b9b26d02e0
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/215c8b96-a821-4947-bee4-c7470e9fbaf8
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/95d4920a-12c7-4e25-b86c-5f075ea23a38
+base_url | https://www.tandfonline.com/doi/full/10.1080/03057070.2016.1197694
+terminal_url | https://www.tandfonline.com/action/cookieAbsent
+-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/8a2508ee-14c9-4c6a-851a-6db442090f41
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+
+Metadata only
+
+DONE: /cookieAbsent is cookie block
+ https://www.tandfonline.com/action/cookieAbsent
+
+### bib-pubdb1.desy.de (FIXED-INVENIO)
+
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:96756
+base_url | http://bib-pubdb1.desy.de/record/96756
+terminal_url | http://bib-pubdb1.desy.de/record/96756
+
+Metadata only.
+
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:416556
+base_url | http://bib-pubdb1.desy.de/record/416556
+terminal_url | http://bib-pubdb1.desy.de/record/416556
+
+Fixed!
+
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:414545
+base_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+terminal_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:170169
+base_url | http://bib-pubdb1.desy.de/record/170169
+terminal_url | http://bib-pubdb1.desy.de/record/170169
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:191154
+base_url | http://bib-pubdb1.desy.de/record/191154
+terminal_url | http://bib-pubdb1.desy.de/record/191154
+
+Metadata only
+
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:155092
+base_url | http://bib-pubdb1.desy.de/record/155092
+terminal_url | http://bib-pubdb1.desy.de/record/155092
+
+Fixed!
+
+-[ RECORD 8 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:97158
+base_url | http://bib-pubdb1.desy.de/record/97158
+terminal_url | http://bib-pubdb1.desy.de/record/97158
+
+Metadata only
+
+"Powered by Invenio v1.1.7"
+
+Can/should skip the "search" URLs
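+
+Untested sketch for counting those Invenio "search" URLs (juser.fz-juelich.de has the
+same pattern):
+
+    SELECT COUNT(*)
+    FROM ingest_request
+    WHERE
+        ingest_type = 'pdf'
+        AND link_source = 'oai'
+        AND (
+            base_url LIKE 'http://bib-pubdb1.desy.de/search?p=id%'
+            OR base_url LIKE 'http://juser.fz-juelich.de/search?p=id%'
+        );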
+
+### serval.unil.ch
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_60346fc75171
+base_url | https://serval.unil.ch/notice/serval:BIB_60346FC75171
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_60346FC75171
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_4db47fc4b593
+base_url | https://serval.unil.ch/notice/serval:BIB_4DB47FC4B593
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_4DB47FC4B593
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_57aac24fe115
+base_url | http://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+terminal_url | https://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_deabae6baf6c
+base_url | https://serval.unil.ch/notice/serval:BIB_DEABAE6BAF6C
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DEABAE6BAF6C
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_a5ec0df1370f
+base_url | https://serval.unil.ch/notice/serval:BIB_A5EC0DF1370F
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253Aed270c26d4a36cefd1bf6a840472abe0ee5556cb5f3b42de708f3ea984775dfd
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_080300c2e23c
+base_url | https://serval.unil.ch/resource/serval:BIB_080300C2E23C.P001/REF.pdf
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A154453d78a0fb75ffa220f7b6fe73b29447fa6ed048addf31897b41001f44679
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_de777dd2b07f
+base_url | https://serval.unil.ch/notice/serval:BIB_DE777DD2B07F
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DE777DD2B07F
+-[ RECORD 8 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_5e824e244c27
+base_url | https://serval.unil.ch/notice/serval:BIB_5E824E244C27
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_5E824E244C27
+
+Metadata only? See elsewhere.
+
+### Random Links
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dbc.wroc.pl:41031
+base_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+terminal_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+
+This is some platform/package thing. PDF is in an iframe. Platform is "DLibra".
+FIXED-DLIBRA
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/174291
+base_url | https://orbi.uliege.be/handle/2268/174291
+terminal_url | https://orbi.uliege.be/handle/2268/174291
+
+DSpace platform. There are multiple files, and little to "select" on.
+
+https://orbi.uliege.be/handle/2268/174200 has only a single PDF and is easier to work with
+
+PARTIAL-DSPACE
+
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.tue.nl:664163
+base_url | http://repository.tue.nl/664163
+terminal_url | http://repository.tue.nl/664163
+
+Ah, this is the Pure platform from Elsevier.
+Redirects to: https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance
+
+FIXED-PURE
+
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:49579
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+
+(handled above)
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/97937
+base_url | https://orcid.org/0000-0002-2066-2082
+terminal_url | https://orcid.org/0000-0002-2066-2082
+
+ORCID! Skip it.
+
+DONE: skip orcid.org in `terminal_url`, and/or at harvest/transform time.
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:edoc.mpg.de:360269
+base_url | http://edoc.mpg.de/360269
+terminal_url | http://edoc.mpg.de/360269
+
+Seems like this whole repo has disappeared, or been replaced by... Pure? Maybe a different Pure instance?
+
+DONE: edoc.mpg.de -> pure.mpg.de
+
+-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:books.openedition.org:msha/17716
+base_url | http://books.openedition.org/msha/17716
+terminal_url | https://books.openedition.org/msha/17716
+
+OpenEdition is free to read as HTML, but not as PDF (or epub, etc).
+
+TODO: for some? all? openedition books records, try HTML ingest (not PDF ingest)
+
+HTML-WORKED
+
+-[ RECORD 8 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epub.oeaw.ac.at:0x003aba48
+base_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+terminal_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+
+requires login
+
+FORBIDDEN
+
+-[ RECORD 9 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/88986
+base_url | https://orcid.org/0000-0002-4147-2560
+terminal_url | https://orcid.org/0000-0002-4147-2560
+
+DONE: skip orcids
+
+-[ RECORD 10 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-28786
+base_url | http://repository.ust.hk/ir/Record/1783.1-28786
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-28786
+
+Generator: VuFind 5.1.1
+just a metadata record
+
+METADATA-ONLY
+
+-[ RECORD 11 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:rcin.org.pl:50797
+base_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+terminal_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+
+Seems like a software platform? not sure.
+
+METADATA-ONLY
+
+-[ RECORD 12 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dea.lib.unideb.hu:2437/69641
+base_url | http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+terminal_url | https://webpac.lib.unideb.hu/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+
+-[ RECORD 13 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/64871
+base_url | http://handle.unsw.edu.au/1959.4/64871
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_62832&context=L
+
+-[ RECORD 14 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.wbc.poznan.pl:225930
+base_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+terminal_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+
+SOFT-404
+
+-[ RECORD 15 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.erciyes.edu.tr:105
+base_url | http://repository.erciyes.edu.tr/bilimname/items/show/105
+terminal_url | http://repository.erciyes.edu.tr:80/bilimname/items/show/105
+
+GONE (domain not registered)
+
+-[ RECORD 16 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:37500
+base_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+terminal_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+
+Seems like a bespoke site
+
+SKIP-BESPOKE
+
+-[ RECORD 17 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50401364
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+
+METADATA-ONLY
+
+-[ RECORD 18 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epubs.cclrc.ac.uk:work/4714
+base_url | http://purl.org/net/epubs/work/4714
+terminal_url | https://epubs.stfc.ac.uk/work/4714
+
+It's got a purl! haha.
+
+METADATA-ONLY
+
+------
+
+Another batch! With some repeat domains removed.
+
+-[ RECORD 1 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:cris.vtt.fi:persons/142c030f-ba7b-491a-8669-a361088355cc
+base_url | https://cris.vtt.fi/en/persons/142c030f-ba7b-491a-8669-a361088355cc
+terminal_url | https://cris.vtt.fi/en/persons/oleg-antropov
+
+SKIP
+
+-[ RECORD 2 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-05302014-183910
+base_url | http://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+
+Some software platform? Pretty basic/bespoke
+
+FIXED-PARTIAL
+
+-[ RECORD 3 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000098246
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+
+SKIP (see elsewhere)
+
+-[ RECORD 7 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:elektra.cdaea.es:documento.29259
+base_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+terminal_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+
+Photo.
+
+SKIP-SCOPE
+
+-[ RECORD 9 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/unsworks_60829
+base_url | http://handle.unsw.edu.au/1959.4/unsworks_60829
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_modsunsworks_60829&context=L
+
+METADATA-ONLY
+
+-[ RECORD 12 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.leuphana.de:publications/7d040cf2-b3b5-4671-8906-76b5bc8d870a
+base_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+terminal_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+
+unsure
+
+-[ RECORD 16 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/369344
+base_url | https://library.wur.nl/WebQuery/wurpubs/369344
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/369344
+
+this specific record is not OA (but the site is fine/fixed)
+
+-[ RECORD 17 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:escholarship.umassmed.edu:oapubs-2146
+base_url | https://escholarship.umassmed.edu/oapubs/1147
+terminal_url | http://escholarship.umassmed.edu/oapubs/1147/
+
+just links to publisher (no content in repo)
+
+-[ RECORD 18 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digitalcommons.usu.edu:wild_facpub-1010
+base_url | https://digitalcommons.usu.edu/wild_facpub/11
+terminal_url | http://digitalcommons.usu.edu/wild_facpub/11/
+
+also just links to publisher (no content in repo)
+
+-[ RECORD 25 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:igi.indrastra.com:306768
+base_url | http://igi.indrastra.com/items/show/306768
+terminal_url | http://igi.indrastra.com/items/show/306768
+
+(see elsewhere)
+
+-[ RECORD 26 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:fau.digital.flvc.org:fau_9804
+base_url | http://purl.flvc.org/fcla/dt/12932
+terminal_url | http://fau.digital.flvc.org/islandora/object/fau%3A9804
+
+Islandora.
+
+-[ RECORD 27 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.lu.lv:7/16019
+base_url | https://dspace.lu.lv/dspace/handle/7/16019
+terminal_url | https://dspace.lu.lv/dspace/handle/7/16019
+
+LOGINWALL
+
+-[ RECORD 28 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:zir.nsk.hr:umas_218
+base_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+terminal_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+
+REMOVED
+
+
+-[ RECORD 29 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:36390
+base_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+terminal_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+
+Book, with chapters, not an individual work.
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:krm.or.kr:10056135m201r
+base_url | https://www.krm.or.kr/krmts/link.html?dbGubun=SD&m201_id=10056135&res=y
+terminal_url | https://www.krm.or.kr/krmts/search/detailview/research.html?dbGubun=SD&category=Research&m201_id=10056135
+
+research results repository; keep crawling
+
+SKIP-SCOPE
+
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.db-thueringen.de:dbt_mods_00005191
+base_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+terminal_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+
+powered by "MyCoRe"
+
+FIXED-MYCORE
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecavirtualandalucia.juntadeandalucia.es:1017405
+base_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+terminal_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+
+seems to be a general purpose regional library? not research-specific
+
+SKIP-UNSURE
+
+-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-02272019-123644
+base_url | http://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+
+This specific URL is not available (FORBIDDEN)
+
+others have multiple files, not just a single PDF:
+https://etd.adm.unipi.it/t/etd-09102013-124430/
+
+SKIP-UNSURE
+
+-[ RECORD 9 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:commons.ln.edu.hk:sw_master-5408
+base_url | https://commons.ln.edu.hk/sw_master/4408
+terminal_url | https://commons.ln.edu.hk/sw_master/4408/
+
+worth crawling I guess
+
+METADATA-ONLY
+
+-[ RECORD 10 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:mouseion.jax.org:ssbb1976-1224
+base_url | https://mouseion.jax.org/ssbb1976/225
+terminal_url | https://mouseion.jax.org/ssbb1976/225/
+
+METADATA-ONLY
+
+-[ RECORD 13 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aleph.bib-bvb.de:bvb01-016604343
+base_url | http://bvbm1.bib-bvb.de/webclient/DeliveryManager?pid=176332&custom_att_2=simple_viewer
+terminal_url | http://digital.bib-bvb.de/view/action/singleViewer.do?dvs=1593269021002~476&locale=en_US&VIEWER_URL=/view/action/singleViewer.do?&DELIVERY_RULE_ID=31&frameId=1&usePid1=true&usePid2=true
+
+SOFT-404 / FORBIDDEN (cookie timeout)
+
+-[ RECORD 14 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bivaldi.gva.es:11740
+base_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+terminal_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+
+
+-[ RECORD 16 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/443282
+base_url | https://library.wur.nl/WebQuery/wurpubs/443282
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/443282
+
+DIGIBIS platform (like some others)
+
+FIXED-PARTIAL
+
+-[ RECORD 18 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:hal:in2p3-00414135v1
+base_url | http://hal.in2p3.fr/in2p3-00414135
+terminal_url | http://hal.in2p3.fr:80/in2p3-00414135
+
+METADATA-ONLY
+
+-[ RECORD 19 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aaltodoc.aalto.fi:123456789/13201
+base_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+terminal_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+
+This specific record is not accessible.
+Another: https://aaltodoc.aalto.fi/handle/123456789/38002
+
+DSpace 5.4
+
+Worked (from recent changes)
+
+
+-[ RECORD 20 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:sedici.unlp.edu.ar:10915/40144
+base_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+terminal_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+
+This is a journal! Cool. Plone software platform.
+
+FIXED
+
+## Top no-capture Domains
+
+Top terminal no-capture domains:
+
+ SELECT domain, COUNT(domain)
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_file_result.status = 'no-capture'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | count
+ -----------------------------------+-------
+ digitalrepository.unm.edu | 94087
+ escholarship.org | 80632
+ ir.opt.ac.cn | 70504
+ idus.us.es | 67908
+ www.cambridge.org | 56376
+ www.ssoar.info | 52534
+ rep.bntu.by | 52127
+ scholarworks.umt.edu | 48546
+ publikationen.ub.uni-frankfurt.de | 46987
+ dk.um.si | 45753
+ repositorio.uladech.edu.pe | 37028
+ uu.diva-portal.org | 34929
+ digitalcommons.law.byu.edu | 31732
+ sedici.unlp.edu.ar | 31233
+ elib.sfu-kras.ru | 29131
+ jyx.jyu.fi | 28144
+ www.repository.cam.ac.uk | 27728
+ nagoya.repo.nii.ac.jp | 26673
+ www.duo.uio.no | 25258
+ www.persee.fr | 24968
+ www2.senado.leg.br | 24426
+ tesis.ucsm.edu.pe | 24049
+ digitalcommons.unl.edu | 21974
+ www.degruyter.com | 21940
+ www.igi-global.com | 20736
+ thekeep.eiu.edu | 20712
+ docs.lib.purdue.edu | 20538
+ repositorio.cepal.org | 20280
+ elib.bsu.by | 19620
+ minds.wisconsin.edu | 19473
+ (30 rows)
+
+These all seem worth crawling. A couple of publishers (cambridge.org) and
+persee.fr will probably fail, but they don't account for many URLs.
+
+## Summary of Filtered Prefixes and Domains (OAI-PMH)
+
+oai:kb.dk:
+ too large and generic
+oai:bdr.oai.bsb-muenchen.de:
+ too large and generic
+oai:hispana.mcu.es:
+ too large and generic
+oai:bnf.fr:
+ too large and generic
+oai:ukm.si:
+ too large and generic
+oai:biodiversitylibrary.org:
+ redundant with other ingest and archive.org content
+oai:hsp.org:
+ large; historical content only
+oai:repec:
+ large; mostly (entirely?) links to publisher sites
+oai:n/a:
+ meta?
+oai:quod.lib.umich.edu:
+ entire issues? hard to crawl so skip for now
+oai:hypotheses.org:
+ HTML, not PDF
+oai:americanae.aecid.es:
+ large, complex. skip for now
+oai:www.irgrid.ac.cn:
+ aggregator of other IRs
+oai:espace.library.uq.edu.au:
+ large; metadata only; javascript heavy (poor heritrix crawling)
+oai:edoc.mpg.de:
+ deprecated domain, with no redirects
+oai:bibliotecadigital.jcyl.es:
+ digitized historical docs; hard to crawl, skip for now
+oai:repository.erciyes.edu.tr:
+ gone (domain lapsed)
+oai:krm.or.kr:
+ "research results repository" (metadata only)
+
+www.kb.dk
+ large, general purpose, scope
+kb-images.kb.dk
+ deprecated
+mdz-nbn-resolving.de
+ multiple prefixes end up here. historical docs, scope
+aggr.ukm.um.si
+ large, out of scope
+edoc.mpg.de
+ deprecated domain
+doaj.org
+ index (metadata only)
+orcid.org
+ out of scope
+gateway.isiknowledge.com
+    clarivate login/paywall (skipping in ingest)
+
+Needs filtering to a subset of records (by 'set' or other filtering?):
+
+oai:igi.indrastra.com:
+oai:invenio.nusl.cz:
+oai:t2r2.star.titech.ac.jp:
+oai:evastar-karlsruhe.de:
+oai:repository.ust.hk:
+oai:serval.unil.ch:
+oai:pure.atira.dk:
+
+Filters in SQL syntax:
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+    AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu.au:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+
+and in some contexts (these are PDF ingest requests which should switch to HTML ingest):
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+## Overall Summary of OAI-PMH Stuff
+
+Big picture is that the majority of `no-pdf-link` crawl statuses are due to
+repository scope, record scope, or content format issues. That said, a sizable
+fraction of sites run platforms (like DSpace) which were not ingesting well.
+
+A significant fraction of records are "metadata only" records for papers, or
+non-paper entity types (like persons, grants, or journal titles), and a growing
+fraction (?) are metadata plus a link to offsite OA publisher fulltext. It
+might be possible to detect these at ingest time, or earlier at OAI-PMH
+harvest/transform time, and filter them out (see the sketch below).
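+
+A minimal sketch of what such harvest/transform-time filtering could look like
+(illustrative Python only, not the actual transform code; the lists are
+abbreviated from the prefix/domain summary above, and the function name is made
+up):
+
+    # Sketch: drop OAI-PMH records we don't want to turn into PDF ingest requests.
+    SKIP_OAI_PREFIXES = [
+        "oai:kb.dk:",
+        "oai:repec:",
+        "oai:edoc.mpg.de:",
+        "oai:krm.or.kr:",
+        # ... plus the rest of the prefix list above
+    ]
+    SKIP_URL_SUBSTRINGS = [
+        "doaj.org",
+        "orcid.org",
+        "gateway.isiknowledge.com",
+        # ... plus the rest of the domain list above
+    ]
+
+    def want_oai_record(oai_id: str, base_url: str) -> bool:
+        """Return False for records to skip at harvest/transform time."""
+        if any(oai_id.startswith(prefix) for prefix in SKIP_OAI_PREFIXES):
+            return False
+        if any(substr in base_url for substr in SKIP_URL_SUBSTRINGS):
+            return False
+        return True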
+
+It may be worthwhile to attempt ingest of multiple existing captures
+(timestamps) in the ingest pipeline. E.g., instead of choosing a single "best"
+capture, if there are multiple HTTP 200 status captures, try ingest with each
+(or at least a couple). This is because repository software gets upgraded, so
+old "no-capture" or "not found" or "link loop" type captures may work when
+recrawled.
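+
+A minimal sketch of that idea (illustrative Python; `Capture` and the
+`try_ingest` callback are hypothetical stand-ins, not actual pipeline code):
+
+    from dataclasses import dataclass
+    from typing import Callable, List, Optional
+
+    @dataclass
+    class Capture:          # stand-in for one CDX row
+        timestamp: str      # e.g. "20200815120000"
+        status_code: int
+        url: str
+
+    def ingest_any_capture(
+        captures: List[Capture],
+        try_ingest: Callable[[Capture], Optional[str]],  # hypothetical single-capture ingest
+        max_attempts: int = 3,
+    ) -> Optional[str]:
+        """Try several HTTP 200 captures instead of a single 'best' capture."""
+        ok = sorted(
+            (c for c in captures if c.status_code == 200),
+            key=lambda c: c.timestamp,
+            reverse=True,  # newest first: repository software may have been upgraded
+        )
+        for capture in ok[:max_attempts]:
+            result = try_ingest(capture)
+            if result is not None:
+                return result
+        return None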
+
+New summary with additional filters:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+        AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu.au:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 12872279
+ no-pdf-link | 9329602
+ no-capture | 4696362
+ redirect-loop | 1541458
+ terminal-bad-status | 660418
+ link-loop | 452831
+ wrong-mimetype | 434868
+ null-body | 71065
+ cdx-error | 17005
+ | 15275
+ petabox-error | 12743
+ wayback-error | 11759
+ skip-url-blocklist | 182
+ gateway-timeout | 122
+ redirects-exceeded | 120
+ bad-redirect | 117
+ bad-gzip-encoding | 111
+ wayback-content-error | 102
+ timeout | 72
+ blocked-cookie | 62
+ (20 rows)
+
diff --git a/notes/ingest/2021-09-03_daily_improvements.md b/notes/ingest/2021-09-03_daily_improvements.md
new file mode 100644
index 0000000..a0bb0c5
--- /dev/null
+++ b/notes/ingest/2021-09-03_daily_improvements.md
@@ -0,0 +1,1021 @@
+
+Periodic check-in of daily crawling/ingest.
+
+Overall ingest status, past 30 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ ingest_type | status | count
+ -------------+-------------------------+--------
+ pdf | no-pdf-link | 158474
+ pdf | spn2-cdx-lookup-failure | 135344
+ pdf | success | 127938
+ pdf | spn2-error | 65411
+ pdf | gateway-timeout | 63112
+ pdf | blocked-cookie | 26338
+ pdf | terminal-bad-status | 24853
+ pdf | link-loop | 15699
+ pdf | spn2-error:job-failed | 13862
+ pdf | redirect-loop | 11432
+ pdf | cdx-error | 2376
+ pdf | too-many-redirects | 2186
+ pdf | wrong-mimetype | 2142
+ pdf | forbidden | 1758
+ pdf | spn2-error:no-status | 972
+ pdf | not-found | 820
+ pdf | bad-redirect | 536
+ pdf | read-timeout | 392
+ pdf | wayback-error | 251
+ pdf | remote-server-error | 220
+ (20 rows)
+
+Hrm, that is a healthy fraction of `no-pdf-link`.
+
+Broken domains, past 30 days:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ -------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 39678
+ osf.io | gateway-timeout | 29809
+ acervus.unicamp.br | no-pdf-link | 21978
+ osf.io | terminal-bad-status | 18727
+ zenodo.org | spn2-cdx-lookup-failure | 17008
+ doi.org | spn2-cdx-lookup-failure | 15503
+ www.degruyter.com | no-pdf-link | 15122
+ ieeexplore.ieee.org | spn2-error:job-failed | 12921
+ osf.io | spn2-cdx-lookup-failure | 11123
+ www.tandfonline.com | blocked-cookie | 8096
+ www.morressier.com | no-pdf-link | 4655
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 4580
+ pubs.acs.org | blocked-cookie | 4415
+ www.frontiersin.org | no-pdf-link | 4163
+ www.degruyter.com | spn2-cdx-lookup-failure | 3788
+ www.taylorfrancis.com | no-pdf-link | 3568
+ www.sciencedirect.com | no-pdf-link | 3128
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 3116
+ acervus.unicamp.br | spn2-cdx-lookup-failure | 2797
+ www.mdpi.com | spn2-cdx-lookup-failure | 2719
+ brill.com | link-loop | 2681
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 2657
+ www.sciencedirect.com | spn2-cdx-lookup-failure | 2546
+ apps.crossref.org | no-pdf-link | 2537
+ onlinelibrary.wiley.com | blocked-cookie | 2528
+ (25 rows)
+
+Summary of significant domains and status, past 30 days, minus spn2-cdx-lookup-failure:
+
+ SELECT domain, status, count
+ FROM (
+ SELECT domain, status, COUNT((domain, status)) as count
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status != 'spn2-cdx-lookup-failure'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY CUBE (domain, status)
+ ) t2
+ WHERE count > 200
+ ORDER BY domain ASC , count DESC;
+
+
+ domain | status | count
+ -----------------------------------------------------------------+-----------------------+--------
+ academic.oup.com | | 2405
+ academic.oup.com | no-pdf-link | 1240
+ academic.oup.com | link-loop | 1010
+ acervus.unicamp.br | | 21980
+ acervus.unicamp.br | no-pdf-link | 21978 **
+ aclanthology.org | | 208
+ acp.copernicus.org | | 365
+ acp.copernicus.org | success | 356
+ aip.scitation.org | | 1071
+ aip.scitation.org | blocked-cookie | 843
+ aip.scitation.org | redirect-loop | 227
+ apps.crossref.org | | 2537
+ apps.crossref.org | no-pdf-link | 2537
+ arxiv.org | | 17817
+ arxiv.org | success | 17370
+ arxiv.org | terminal-bad-status | 320
+ asmedigitalcollection.asme.org | | 401
+ asmedigitalcollection.asme.org | link-loop | 364
+ assets.researchsquare.com | | 3706
+ assets.researchsquare.com | success | 3706
+ avmj.journals.ekb.eg | | 605
+ avmj.journals.ekb.eg | success | 595
+ bfa.journals.ekb.eg | | 224
+ bfa.journals.ekb.eg | success | 214
+ biorxiv.org | redirect-loop | 895
+ biorxiv.org | | 895
+ birdsoftheworld.org | | 286
+ birdsoftheworld.org | no-pdf-link | 285
+ bmjopen.bmj.com | success | 232
+ bmjopen.bmj.com | | 232
+ books.openedition.org | | 396
+ books.openedition.org | no-pdf-link | 396
+ brill.com | | 4272
+ brill.com | link-loop | 2681
+ brill.com | no-pdf-link | 1410
+ cas.columbia.edu | | 1038
+ cas.columbia.edu | no-pdf-link | 1038 **
+ cdr.lib.unc.edu | | 513
+ cdr.lib.unc.edu | success | 469
+ chemrxiv.org | | 278
+ chemrxiv.org | success | 275
+ classiques-garnier.com | | 531
+ classiques-garnier.com | no-pdf-link | 487 *
+ content.iospress.com | | 275
+ content.iospress.com | link-loop | 230
+ cris.maastrichtuniversity.nl | | 318
+ cris.maastrichtuniversity.nl | success | 284
+ cyberleninka.ru | | 1165
+ cyberleninka.ru | success | 1134
+ deepblue.lib.umich.edu | | 289
+ dergipark.org.tr | | 1185
+ dergipark.org.tr | success | 774
+ dergipark.org.tr | no-pdf-link | 320
+ didaktorika.gr | | 688
+ didaktorika.gr | redirect-loop | 688
+ digi.ub.uni-heidelberg.de | | 292
+ digi.ub.uni-heidelberg.de | no-pdf-link | 292
+ direct.mit.edu | | 236
+ direct.mit.edu | no-pdf-link | 207 *
+ dl.acm.org | | 2319
+ dl.acm.org | blocked-cookie | 2230
+ dmtcs.episciences.org | | 733
+ dmtcs.episciences.org | success | 730
+ doi.ala.org.au | no-pdf-link | 2373 **
+ doi.ala.org.au | | 2373
+ doi.org | | 732
+ doi.org | terminal-bad-status | 673
+ downloads.hindawi.com | success | 1452
+ downloads.hindawi.com | | 1452
+ drive.google.com | | 216
+ drive.google.com | no-pdf-link | 211
+ dtb.bmj.com | | 674
+ dtb.bmj.com | link-loop | 669
+ easy.dans.knaw.nl | no-pdf-link | 261 *
+ easy.dans.knaw.nl | | 261
+ ebooks.marilia.unesp.br | | 688
+ ebooks.marilia.unesp.br | no-pdf-link | 688 *
+ ehp.niehs.nih.gov | | 766
+ ehp.niehs.nih.gov | blocked-cookie | 765
+ ejournal.mandalanursa.org | | 307
+ ejournal.mandalanursa.org | success | 305
+ elib.spbstu.ru | | 264
+ elib.spbstu.ru | redirect-loop | 257
+ elibrary.ru | | 1367
+ elibrary.ru | redirect-loop | 1169
+ elibrary.vdi-verlag.de | | 1251
+ elibrary.vdi-verlag.de | no-pdf-link | 646
+ elibrary.vdi-verlag.de | link-loop | 537
+ elifesciences.org | | 328
+ elifesciences.org | success | 323
+ figshare.com | | 803
+ figshare.com | no-pdf-link | 714 *
+ files.osf.io | | 745
+ files.osf.io | success | 614
+ hammer.purdue.edu | | 244
+ hammer.purdue.edu | no-pdf-link | 243
+ heiup.uni-heidelberg.de | | 277
+ heiup.uni-heidelberg.de | no-pdf-link | 268
+ hkvalidate.perfdrive.com | no-pdf-link | 370 *
+ hkvalidate.perfdrive.com | | 370
+ ieeexplore.ieee.org | | 16675
+ ieeexplore.ieee.org | spn2-error:job-failed | 12927
+ ieeexplore.ieee.org | success | 1952
+ ieeexplore.ieee.org | too-many-redirects | 1193
+ ieeexplore.ieee.org | no-pdf-link | 419
+ jamanetwork.com | | 339
+ jamanetwork.com | success | 216
+ jmstt.ntou.edu.tw | | 244
+ jmstt.ntou.edu.tw | success | 241
+ journal.ipb.ac.id | | 229
+ journal.ipb.ac.id | success | 206
+ journal.nafe.org | | 221
+ journals.aps.org | | 614
+ journals.aps.org | gateway-timeout | 495
+ journals.asm.org | | 463
+ journals.asm.org | blocked-cookie | 435
+ journals.flvc.org | | 230
+ journals.lww.com | | 1300
+ journals.lww.com | link-loop | 1284
+ journals.openedition.org | | 543
+ journals.openedition.org | success | 311
+ journals.ub.uni-heidelberg.de | | 357
+ journals.ub.uni-heidelberg.de | success | 311
+ jov.arvojournals.org | | 431
+ jov.arvojournals.org | no-pdf-link | 422 *
+ kiss.kstudy.com | | 303
+ kiss.kstudy.com | no-pdf-link | 303 *
+ library.iated.org | | 364
+ library.iated.org | redirect-loop | 264
+ library.seg.org | blocked-cookie | 301
+ library.seg.org | | 301
+ link.aps.org | redirect-loop | 442
+ link.aps.org | | 442
+ linkinghub.elsevier.com | | 515
+ linkinghub.elsevier.com | gateway-timeout | 392
+ mc.sbm.org.br | | 224
+ mc.sbm.org.br | success | 224
+ mdpi-res.com | | 742
+ mdpi-res.com | success | 742
+ mdsoar.org | | 220
+ mediarep.org | | 269
+ mediarep.org | success | 264
+ medrxiv.org | redirect-loop | 290
+ medrxiv.org | | 290
+ muse.jhu.edu | | 429
+ muse.jhu.edu | terminal-bad-status | 391
+ mvmj.journals.ekb.eg | | 306
+ oapub.org | | 292
+ oapub.org | success | 289
+ onepetro.org | | 426
+ onepetro.org | link-loop | 406
+ onlinelibrary.wiley.com | | 2835
+ onlinelibrary.wiley.com | blocked-cookie | 2531
+ onlinelibrary.wiley.com | redirect-loop | 264
+ open.library.ubc.ca | | 569
+ open.library.ubc.ca | no-pdf-link | 425 *
+ opendata.uni-halle.de | | 407
+ opendata.uni-halle.de | success | 263
+ osf.io | | 49022
+ osf.io | gateway-timeout | 29810
+ osf.io | terminal-bad-status | 18731
+ osf.io | spn2-error | 247
+ osf.io | not-found | 205
+ oxford.universitypressscholarship.com | | 392
+ oxford.universitypressscholarship.com | link-loop | 233
+ panor.ru | no-pdf-link | 433 *
+ panor.ru | | 433
+ papers.ssrn.com | | 1630
+ papers.ssrn.com | link-loop | 1598
+ pdf.sciencedirectassets.com | | 3063
+ pdf.sciencedirectassets.com | success | 3063
+ peerj.com | | 464
+ peerj.com | no-pdf-link | 303 *
+ periodicos.ufpe.br | | 245
+ periodicos.ufpe.br | success | 232
+ periodicos.unb.br | | 230
+ periodicos.unb.br | success | 221
+ preprints.jmir.org | | 548
+ preprints.jmir.org | cdx-error | 499
+ publications.rwth-aachen.de | | 213
+ publikationen.bibliothek.kit.edu | | 346
+ publikationen.bibliothek.kit.edu | success | 314
+ publikationen.uni-tuebingen.de | | 623
+ publikationen.uni-tuebingen.de | no-pdf-link | 522 *
+ publons.com | no-pdf-link | 934 *
+ publons.com | | 934
+ pubs.acs.org | | 4507
+ pubs.acs.org | blocked-cookie | 4406
+ pubs.rsc.org | | 1638
+ pubs.rsc.org | link-loop | 1054
+ pubs.rsc.org | redirect-loop | 343
+ pubs.rsc.org | success | 201
+ repositorio.ufu.br | | 637
+ repositorio.ufu.br | success | 607
+ repository.dri.ie | | 1852
+ repository.dri.ie | no-pdf-link | 1852 **
+ repository.library.brown.edu | | 293
+ repository.library.brown.edu | no-pdf-link | 291 *
+ res.mdpi.com | | 10367
+ res.mdpi.com | success | 10360
+ retrovirology.biomedcentral.com | | 230
+ revistas.ufrj.br | | 284
+ revistas.ufrj.br | success | 283
+ revistas.uptc.edu.co | | 385
+ revistas.uptc.edu.co | success | 344
+ royalsocietypublishing.org | | 231
+ rsdjournal.org | | 347
+ rsdjournal.org | success | 343
+ s3-ap-southeast-2.amazonaws.com | | 400
+ s3-ap-southeast-2.amazonaws.com | success | 392
+ s3-eu-west-1.amazonaws.com | | 2096
+ s3-eu-west-1.amazonaws.com | success | 2091
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 289
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 286
+ s3.ca-central-1.amazonaws.com | | 202
+ sage.figshare.com | | 242
+ sage.figshare.com | no-pdf-link | 241
+ sajeb.org | | 246
+ sajeb.org | no-pdf-link | 243
+ scholar.dkyobobook.co.kr | | 332
+ scholar.dkyobobook.co.kr | no-pdf-link | 328 *
+ search.mandumah.com | | 735
+ search.mandumah.com | redirect-loop | 726
+ secure.jbs.elsevierhealth.com | | 1112
+ secure.jbs.elsevierhealth.com | blocked-cookie | 1108
+ stm.bookpi.org | no-pdf-link | 468 *
+ stm.bookpi.org | | 468
+ storage.googleapis.com | | 1012
+ storage.googleapis.com | success | 1012
+ tandf.figshare.com | | 469
+ tandf.figshare.com | no-pdf-link | 466
+ teses.usp.br | | 739
+ teses.usp.br | success | 730
+ tidsskrift.dk | | 360
+ tidsskrift.dk | success | 346
+ tiedejaedistys.journal.fi | | 224
+ tind-customer-agecon.s3.amazonaws.com | success | 332
+ tind-customer-agecon.s3.amazonaws.com | | 332
+ valep.vc.univie.ac.at | no-pdf-link | 280
+ valep.vc.univie.ac.at | | 280
+ watermark.silverchair.com | | 1729
+ watermark.silverchair.com | success | 1719
+ www.academia.edu | | 387
+ www.academia.edu | no-pdf-link | 386
+ www.ahajournals.org | | 430
+ www.ahajournals.org | blocked-cookie | 413
+ www.atenaeditora.com.br | | 572
+ www.atenaeditora.com.br | terminal-bad-status | 513
+ www.atlantis-press.com | success | 722
+ www.atlantis-press.com | | 722
+ www.aup-online.com | | 419
+ www.aup-online.com | no-pdf-link | 419 *
+ www.beck-elibrary.de | | 269
+ www.beck-elibrary.de | no-pdf-link | 268 *
+ www.biodiversitylibrary.org | no-pdf-link | 528 *
+ www.biodiversitylibrary.org | | 528
+ www.bloomsburycollections.com | | 623
+ www.bloomsburycollections.com | no-pdf-link | 605 *
+ www.cabi.org | | 2191
+ www.cabi.org | no-pdf-link | 2186 *
+ www.cairn.info | | 1283
+ www.cairn.info | no-pdf-link | 713
+ www.cairn.info | link-loop | 345
+ www.cambridge.org | | 4128
+ www.cambridge.org | no-pdf-link | 1531
+ www.cambridge.org | success | 1441
+ www.cambridge.org | link-loop | 971
+ www.cureus.com | no-pdf-link | 526 *
+ www.cureus.com | | 526
+ www.dbpia.co.kr | | 637
+ www.dbpia.co.kr | redirect-loop | 631
+ www.deboni.he.com.br | | 382
+ www.deboni.he.com.br | success | 381
+ www.degruyter.com | | 17783
+ www.degruyter.com | no-pdf-link | 15102
+ www.degruyter.com | success | 2584
+ www.dovepress.com | | 480
+ www.dovepress.com | success | 472
+ www.e-manuscripta.ch | | 1350
+ www.e-manuscripta.ch | no-pdf-link | 1350 *
+ www.e-periodica.ch | | 1276
+ www.e-periodica.ch | no-pdf-link | 1275
+ www.e-rara.ch | | 202
+ www.e-rara.ch | no-pdf-link | 202
+ www.elgaronline.com | | 495
+ www.elgaronline.com | link-loop | 290
+ www.elibrary.ru | | 922
+ www.elibrary.ru | no-pdf-link | 904
+ www.emerald.com | | 2155
+ www.emerald.com | no-pdf-link | 1936 *
+ www.emerald.com | success | 219
+ www.eurekaselect.com | | 518
+ www.eurekaselect.com | no-pdf-link | 516 *
+ www.frontiersin.org | | 4163
+ www.frontiersin.org | no-pdf-link | 4162 **
+ www.hanser-elibrary.com | | 444
+ www.hanser-elibrary.com | blocked-cookie | 444
+ www.hanspub.org | | 334
+ www.hanspub.org | no-pdf-link | 314
+ www.idunn.no | | 1736
+ www.idunn.no | link-loop | 596
+ www.idunn.no | success | 577
+ www.idunn.no | no-pdf-link | 539
+ www.igi-global.com | terminal-bad-status | 458
+ www.igi-global.com | | 458
+ www.ijcai.org | | 533
+ www.ijcai.org | success | 532
+ www.ijraset.com | success | 385
+ www.ijraset.com | | 385
+ www.inderscience.com | | 712
+ www.inderscience.com | no-pdf-link | 605 *
+ www.ingentaconnect.com | | 456
+ www.ingentaconnect.com | no-pdf-link | 413 *
+ www.internationaljournalssrg.org | | 305
+ www.internationaljournalssrg.org | no-pdf-link | 305 *
+ www.isca-speech.org | | 2392
+ www.isca-speech.org | no-pdf-link | 2391 **
+ www.journals.uchicago.edu | | 228
+ www.journals.uchicago.edu | blocked-cookie | 227
+ www.jstage.jst.go.jp | | 1492
+ www.jstage.jst.go.jp | success | 1185
+ www.jstage.jst.go.jp | no-pdf-link | 289
+ www.jstor.org | | 301
+ www.jurology.com | | 887
+ www.jurology.com | redirect-loop | 887
+ www.karger.com | | 318
+ www.liebertpub.com | | 507
+ www.liebertpub.com | blocked-cookie | 496
+ www.morressier.com | | 4781
+ www.morressier.com | no-pdf-link | 4655 **
+ www.ncl.ecu.edu | | 413
+ www.ncl.ecu.edu | success | 413
+ www.nomos-elibrary.de | | 526
+ www.nomos-elibrary.de | no-pdf-link | 391
+ www.oecd-ilibrary.org | no-pdf-link | 1170 **
+ www.oecd-ilibrary.org | | 1170
+ www.openagrar.de | no-pdf-link | 221
+ www.openagrar.de | | 221
+ www.osapublishing.org | | 900
+ www.osapublishing.org | link-loop | 615
+ www.osapublishing.org | no-pdf-link | 269
+ www.osti.gov | | 630
+ www.osti.gov | link-loop | 573
+ www.oxfordlawtrove.com | no-pdf-link | 476 *
+ www.oxfordlawtrove.com | | 476
+ www.pdcnet.org | | 298
+ www.pdcnet.org | terminal-bad-status | 262
+ www.pedocs.de | | 203
+ www.pnas.org | | 222
+ www.preprints.org | | 372
+ www.preprints.org | success | 366
+ www.repository.cam.ac.uk | | 801
+ www.repository.cam.ac.uk | success | 359
+ www.repository.cam.ac.uk | no-pdf-link | 239
+ www.research-collection.ethz.ch | | 276
+ www.research-collection.ethz.ch | terminal-bad-status | 274
+ www.revistas.usp.br | | 207
+ www.revistas.usp.br | success | 204
+ www.rina.org.uk | no-pdf-link | 1009 **
+ www.rina.org.uk | | 1009
+ www.schweizerbart.de | no-pdf-link | 202
+ www.schweizerbart.de | | 202
+ www.scielo.br | | 544
+ www.scielo.br | redirect-loop | 526
+ www.sciencedirect.com | | 3901
+ www.sciencedirect.com | no-pdf-link | 3127 **
+ www.sciencedirect.com | link-loop | 701
+ www.sciendo.com | | 384
+ www.sciendo.com | success | 363
+ www.sciengine.com | | 225
+ www.scirp.org | | 209
+ www.spandidos-publications.com | | 205
+ www.tandfonline.com | | 8925
+ www.tandfonline.com | blocked-cookie | 8099
+ www.tandfonline.com | terminal-bad-status | 477
+ www.tandfonline.com | redirect-loop | 322
+ www.taylorfrancis.com | | 6119
+ www.taylorfrancis.com | no-pdf-link | 3567
+ www.taylorfrancis.com | link-loop | 2169
+ www.taylorfrancis.com | terminal-bad-status | 353
+ www.thieme-connect.de | | 1047
+ www.thieme-connect.de | redirect-loop | 472
+ www.thieme-connect.de | spn2-error:job-failed | 343
+ www.tib.eu | | 206
+ www.trp.org.in | | 311
+ www.trp.org.in | success | 311
+ www.un-ilibrary.org | no-pdf-link | 597 *
+ www.un-ilibrary.org | | 597
+ www.vr-elibrary.de | | 775
+ www.vr-elibrary.de | blocked-cookie | 774
+ www.wjgnet.com | | 204
+ www.wjgnet.com | no-pdf-link | 204
+ www.worldscientific.com | | 974
+ www.worldscientific.com | blocked-cookie | 971
+ www.worldwidejournals.com | | 242
+ www.worldwidejournals.com | no-pdf-link | 203
+ www.wto-ilibrary.org | no-pdf-link | 295
+ www.wto-ilibrary.org | | 295
+ www.zora.uzh.ch | | 222
+ zenodo.org | | 49460
+ zenodo.org | no-pdf-link | 39721
+ zenodo.org | success | 8954
+ zenodo.org | wrong-mimetype | 562
+ | | 445919
+ | no-pdf-link | 168035
+ | success | 140875
+ | gateway-timeout | 31809
+ | blocked-cookie | 26431
+ | terminal-bad-status | 25625
+ | link-loop | 19006
+ | spn2-error:job-failed | 13962
+ | redirect-loop | 12512
+ | wrong-mimetype | 2302
+ | spn2-error | 1689
+ | too-many-redirects | 1203
+ | bad-redirect | 732
+ | cdx-error | 539
+ | not-found | 420
+ | spn2-error:no-status | 256
+ (419 rows)
+
+Get random subsets by terminal domain:
+
+ \x auto
+ SELECT
+ ingest_request.link_source_id AS link_source_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_file_result.terminal_url LIKE '%//DOMAIN/%'
+ ORDER BY random()
+ LIMIT 5;
+
+## acervus.unicamp.br
+
+Previously flagged as messy (2021-05_daily_improvements.md)
+
+## cas.columbia.edu
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-2ety-qm51
+base_url | https://doi.org/10.7916/d8-2ety-qm51
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-0zf6-d167
+base_url | https://doi.org/10.7916/d8-0zf6-d167
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-k6ha-sn43
+base_url | https://doi.org/10.7916/d8-k6ha-sn43
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-bj6t-eb07
+base_url | https://doi.org/10.7916/d8-bj6t-eb07
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-xjac-j502
+base_url | https://doi.org/10.7916/d8-xjac-j502
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+
+these are not public (loginwalls)
+
+DONE: '/login?TARGET=' as a login wall pattern
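+
+Conceptually this is just a substring check against the terminal URL (a sketch;
+the actual sandcrawler pattern list is maintained separately and is longer):
+
+    # Sketch of a terminal-URL skip/blocklist check (illustrative only).
+    SKIP_TERMINAL_URL_PATTERNS = [
+        "/login?TARGET=",   # CAS login redirects, e.g. cas.columbia.edu
+        "://orcid.org/",    # ORCID profiles, not fulltext (also skipped earlier)
+    ]
+
+    def terminal_url_blocked(terminal_url: str) -> bool:
+        return any(pattern in terminal_url for pattern in SKIP_TERMINAL_URL_PATTERNS)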
+
+## doi.ala.org.au
+
+Previously flagged as dataset repository; datacite metadata is wrong. (2021-05_daily_improvements.md)
+
+NOTE: look at ingesting datasets
+
+## www.isca-speech.org
+
+-[ RECORD 1 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2014-84
+base_url | https://doi.org/10.21437/interspeech.2014-84
+terminal_url | https://www.isca-speech.org/archive/interspeech_2014/li14b_interspeech.html
+-[ RECORD 2 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2004-319
+base_url | https://doi.org/10.21437/interspeech.2004-319
+terminal_url | https://www.isca-speech.org/archive/interspeech_2004/delcroix04_interspeech.html
+-[ RECORD 3 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-372
+base_url | https://doi.org/10.21437/interspeech.2006-372
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/lei06c_interspeech.html
+-[ RECORD 4 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2015-588
+base_url | https://doi.org/10.21437/interspeech.2015-588
+terminal_url | https://www.isca-speech.org/archive/interspeech_2015/polzehl15b_interspeech.html
+-[ RECORD 5 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-468
+base_url | https://doi.org/10.21437/interspeech.2006-468
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html
+
+Bespoke site. Added rule to sandcrawler.
+
+NOTE: re-ingest/recrawl all isca-speech.org no-pdf-link terminal URLs (fatcat-ingest?)
+
+## www.morressier.com
+
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0002858v
+base_url | https://doi.org/10.1115/1.0002858v
+terminal_url | https://www.morressier.com/article/development-new-single-highdensity-heatflux-gauges-unsteady-heat-transfer-measurements-rotating-transonic-turbine/60f162805d86378f03b49af5
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0003896v
+base_url | https://doi.org/10.1115/1.0003896v
+terminal_url | https://www.morressier.com/article/experimental-investigation-proton-exchange-membrane-fuel-cell-platinum-nafion-along-inplane-direction/60f16d555d86378f03b50038
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0004476v
+base_url | https://doi.org/10.1115/1.0004476v
+terminal_url | https://www.morressier.com/article/effect-air-release-agents-performance-results-fabric-lined-bushings/60f16d585d86378f03b502d5
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0001286v
+base_url | https://doi.org/10.1115/1.0001286v
+terminal_url | https://www.morressier.com/article/development-verification-modelling-practice-cfd-calculations-obtain-current-loads-fpso/60f15d3fe537565438d70ece
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0000315v
+base_url | https://doi.org/10.1115/1.0000315v
+terminal_url | https://www.morressier.com/article/fire-event-analysis-fire-frequency-estimation-japanese-nuclear-power-plant/60f15a6f5d86378f03b43874
+
+Many of these seem to be presentations, as both video and slides. PDFs seem broken though.
+
+NOTE: add to list of interesting rich media to crawl/preserve (video+slides+data)
+
+## www.oecd-ilibrary.org
+
+Paywall (2021-05_daily_improvements.md)
+
+## www.rina.org.uk
+
+-[ RECORD 1 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.ws.2002.10
+base_url | https://doi.org/10.3940/rina.ws.2002.10
+terminal_url | https://www.rina.org.uk/showproducts.html?product=4116
+-[ RECORD 2 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.pass.2003.16
+base_url | https://doi.org/10.3940/rina.pass.2003.16
+terminal_url | https://www.rina.org.uk/showproducts.html?product=3566
+-[ RECORD 3 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin.2013.15
+base_url | https://doi.org/10.3940/rina.icsotin.2013.15
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8017
+-[ RECORD 4 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.wfa.2010.23
+base_url | https://doi.org/10.3940/rina.wfa.2010.23
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8177
+-[ RECORD 5 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin15.2015.01
+base_url | https://doi.org/10.3940/rina.icsotin15.2015.01
+terminal_url | https://www.rina.org.uk/showproducts.html?product=7883
+
+Site is broken in some way
+
+## www.sciencedirect.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.jhlste.2021.100332
+base_url | https://doi.org/10.1016/j.jhlste.2021.100332
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S1473837621000332
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.hazadv.2021.100006
+base_url | https://doi.org/10.1016/j.hazadv.2021.100006
+terminal_url | https://www.sciencedirect.com/science/article/pii/S2772416621000061/pdfft?md5=e51bfd495bb53073c7a379d25cb11a32&pid=1-s2.0-S2772416621000061-main.pdf
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-12-822844-9.00009-8
+base_url | https://doi.org/10.1016/b978-0-12-822844-9.00009-8
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780128228449000098
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.colcom.2021.100490
+base_url | https://doi.org/10.1016/j.colcom.2021.100490
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S2215038221001308
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-323-85245-6.00012-6
+base_url | https://doi.org/10.1016/b978-0-323-85245-6.00012-6
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780323852456000126
+
+These `no-pdf-link` ones seem to simply not be OA, which is expected for much
+of this domain.
+
+## repository.dri.ie
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+---------------------------------------------
+ 10.7486/dri.t148v5941 | https://doi.org/10.7486/dri.t148v5941 | https://repository.dri.ie/catalog/t148v5941
+ 10.7486/dri.2z119c98f | https://doi.org/10.7486/dri.2z119c98f | https://repository.dri.ie/catalog/2z119c98f
+ 10.7486/dri.qf8621102 | https://doi.org/10.7486/dri.qf8621102 | https://repository.dri.ie/catalog/qf8621102
+ 10.7486/dri.js95m457t | https://doi.org/10.7486/dri.js95m457t | https://repository.dri.ie/catalog/js95m457t
+ 10.7486/dri.c534vb726 | https://doi.org/10.7486/dri.c534vb726 | https://repository.dri.ie/catalog/c534vb726
+
+"Digital repository of Ireland"
+
+Historical scanned content. Bespoke site. Fixed.
+
+NOTE: recrawl/retry this domain
+
+## www.frontiersin.org
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/978-2-88971-147-5
+base_url | https://doi.org/10.3389/978-2-88971-147-5
+terminal_url | https://www.frontiersin.org/research-topics/9081/neuroimaging-approaches-to-the-study-of-tinnitus-and-hyperacusis
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fnins.2021.722592
+base_url | https://doi.org/10.3389/fnins.2021.722592
+terminal_url | https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fcell.2021.683209
+base_url | https://doi.org/10.3389/fcell.2021.683209
+terminal_url | https://www.frontiersin.org/articles/10.3389/fcell.2021.683209/full
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fmicb.2021.692474
+base_url | https://doi.org/10.3389/fmicb.2021.692474
+terminal_url | https://www.frontiersin.org/articles/10.3389/fmicb.2021.692474/full
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fneur.2021.676527
+base_url | https://doi.org/10.3389/fneur.2021.676527
+terminal_url | https://www.frontiersin.org/articles/10.3389/fneur.2021.676527/full
+
+All the `/research-topics/` URLs are out of scope.
+
+NOTE: recrawl missing frontiersin.org articles for PDFs
+NOTE: recrawl missing frontiersin.org articles for XML (?)
+
+-------
+
+## direct.mit.edu
+
+Previously "not available" (2021-05_daily_improvements.md)
+
+## figshare.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15052236.v6
+base_url | https://doi.org/10.6084/m9.figshare.15052236.v6
+terminal_url | https://figshare.com/articles/software/RCL-tree_rar/15052236/6
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.14907846.v5
+base_url | https://doi.org/10.6084/m9.figshare.14907846.v5
+terminal_url | https://figshare.com/articles/book/Conservation_of_Limestone_Ecosystems_of_Malaysia_Part_I_Acknowledgements_Methodology_Overview_of_limestone_outcrops_in_Malaysia_References_Detailed_information_on_limestone_outcrops_of_the_states_Johor_Negeri_Sembilan_Terengganu_Selangor_Pe/14907846/5
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15157614.v1
+base_url | https://doi.org/10.6084/m9.figshare.15157614.v1
+terminal_url | https://figshare.com/articles/software/code_for_NN-A72265C/15157614/1
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15172926.v1
+base_url | https://doi.org/10.6084/m9.figshare.15172926.v1
+terminal_url | https://figshare.com/articles/preprint/History_of_the_internet/15172926/1
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.16532574.v1
+base_url | https://doi.org/10.6084/m9.figshare.16532574.v1
+terminal_url | https://figshare.com/articles/media/Helen_McConnell_How_many_trees_do_you_think_you_have_planted_/16532574/1
+
+NOTE: can determine the content type (software, book, preprint, media, etc.)
+from the redirect URL, I guess. This is helpful for ingest! Could also
+potentially correct the fatcat release_type using this info.
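+
+A minimal sketch (assuming the `/articles/<type>/...` URL layout seen above; the
+type-to-release_type mapping is a guess, not verified against fatcat vocabulary)
+of deriving a release type from a figshare terminal URL:
+
+    from urllib.parse import urlparse
+
+    # figshare item type (from the URL path) -> fatcat release_type (assumed mapping)
+    FIGSHARE_TYPE_MAP = {
+        "software": "software",
+        "book": "book",
+        "preprint": "article",
+        "dataset": "dataset",
+        "media": "stub",
+    }
+
+    def figshare_release_type(terminal_url):
+        """Guess a release_type from a figshare.com /articles/<type>/<slug>/<id> URL."""
+        segments = urlparse(terminal_url).path.strip("/").split("/")
+        if len(segments) >= 2 and segments[0] == "articles":
+            return FIGSHARE_TYPE_MAP.get(segments[1])
+        return None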
+
+We seem to be getting the ones we can (eg, papers) just fine.
+
+## hkvalidate.perfdrive.com
+
+Should be skipping/bailing on this domain, but isn't for some reason.
+
+-[ RECORD 1 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05cc
+base_url | https://doi.org/10.3847/1538-4357/ac05cc
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=1716a049-aeaa-4a89-8f82-bd733adaa2e7&ssb=43981203877&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05cc&ssi=0774dd12-8427-4e27-a2ac-759c8cc2ec0e&ssk=support@shieldsquare.com&ssm=07370915269044035109047683305266&ssn=e69c743cc3d66619f960f924b562160d637e8d7f1b0f-d3bb-44d4-b075ed&sso=75a8bd85-4a097fb40f99bfb9c97b0a4ca0a38fd6d79513a466e82cc7&ssp=92054607321628531005162856888275586&ssq=33809984098158010864140981653938424553916&ssr=MjA3LjI0MS4yMjUuMTM5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 2 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac0429
+base_url | https://doi.org/10.3847/1538-4357/ac0429
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=12bca70d-0af4-4241-9c9b-384befd96a88&ssb=92559232428&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac0429&ssi=cff72ab0-8427-4acd-a0e7-db1b04cf7ce7&ssk=support@shieldsquare.com&ssm=27895673282814430105287068829605&ssn=9af36a8e10efd239c9367a2f31dde500f7455c4d5f45-bf11-4b99-ad29ea&sso=26bd22d2-b23e1bd9558f2fd9ed0768ef1acecb24715d1d463328a229&ssp=16502500621628222613162823304820671&ssq=11469693950387070477339503456478590533604&ssr=MjA3LjI0MS4yMjUuMTYw&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 3 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1149/1945-7111/ac1a85
+base_url | https://doi.org/10.1149/1945-7111/ac1a85
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=b0fef51a-0f44-476e-b951-3341bde6aa67&ssb=84929220393&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.1149%2F1945-7111%2Fac1a85&ssi=48c05577-8427-4421-acd3-735ca29a46e6&ssk=support@shieldsquare.com&ssm=81129482524077974103852241068134&ssn=cf6c261d2b20d518b2ebe57e40ffaec9ab4cd1955dcb-7877-4f5b-bc3b1e&sso=1d196cae-6850f1ed8143e460f2bfbb61a8ae15cfe6b53d3bcdc528ca&ssp=99289867941628195224162819241830491&ssq=16897595632212421273956322948987630170313&ssr=MjA3LjI0MS4yMjUuMjM2&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 4 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.35848/1882-0786/ac1b0d
+base_url | https://doi.org/10.35848/1882-0786/ac1b0d
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=6debdd23-c46b-4b40-b73c-d5540f04454e&ssb=95627212532&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.35848%2F1882-0786%2Fac1b0d&ssi=78b34ff9-8427-4d07-a0db-78a3aa2c7332&ssk=support@shieldsquare.com&ssm=54055111549093989106852695053789&ssn=cb51949e15a02cb99a8d0b57c4d06327b72e8d5c87a8-d006-4ffa-939ffb&sso=1b7fd62d-8107746fe28fca252fd45ffa403937e272bf75b452b68d4a&ssp=77377533171628212164162820021422494&ssq=02679025218797637682252187852000657274192&ssr=MjA3LjI0MS4yMzMuMTIx&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 5 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05ba
+base_url | https://doi.org/10.3847/1538-4357/ac05ba
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=f127eb3d-6a05-459d-97f2-499715c04b13&ssb=06802230353&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05ba&ssi=8d087719-8427-4046-91fb-5e96af401560&ssk=support@shieldsquare.com&ssm=21056861072205974105064006574997&ssn=d05a73cff6d9af57acd6e2c366e716176752e1164d39-b9a7-408c-837d11&sso=d3f38d1e-a562a19195042d7e471a5e4fab03b6ca16ff1711c7c61804&ssp=68781137401628744693162877909483738&ssq=79454859841502433261398415426689546750534&ssr=MjA3LjI0MS4yMzIuMTg5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+
+The ingest pipeline was failing to re-check the terminal URL against the domain
+blocklist at the end of attempts.
+
+Could retry all these to update status, but probably not worth it.
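+
+A minimal sketch of the intended behavior (function and variable names here are
+hypothetical, not the actual sandcrawler code): apply the domain blocklist to the
+terminal URL after redirects, not just the base URL, so bot-check interstitials
+like this end up as `skip-url-blocklist` rather than some other failure status:
+
+    from urllib.parse import urlparse
+
+    DOMAIN_BLOCKLIST = ["hkvalidate.perfdrive.com"]
+
+    def blocked_domain(url):
+        """True if the URL's host is (a subdomain of) a blocklisted domain."""
+        host = urlparse(url).netloc.lower()
+        return any(host == d or host.endswith("." + d) for d in DOMAIN_BLOCKLIST)
+
+    def final_status(base_url, terminal_url, status):
+        # re-check the blocklist against the *terminal* URL, not just the base URL
+        if blocked_domain(base_url) or blocked_domain(terminal_url):
+            return "skip-url-blocklist"
+        return status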
+
+## jov.arvojournals.org
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+-------------------------------------------------------------
+ 10.1167/jov.21.9.1933 | https://doi.org/10.1167/jov.21.9.1933 | https://jov.arvojournals.org/article.aspx?articleid=2777021
+ 10.1167/jov.21.9.2910 | https://doi.org/10.1167/jov.21.9.2910 | https://jov.arvojournals.org/article.aspx?articleid=2777561
+ 10.1167/jov.21.9.1895 | https://doi.org/10.1167/jov.21.9.1895 | https://jov.arvojournals.org/article.aspx?articleid=2777057
+ 10.1167/jov.21.9.2662 | https://doi.org/10.1167/jov.21.9.2662 | https://jov.arvojournals.org/article.aspx?articleid=2777793
+ 10.1167/jov.21.9.2246 | https://doi.org/10.1167/jov.21.9.2246 | https://jov.arvojournals.org/article.aspx?articleid=2777441
+
+These seem to just not be published/available yet.
+
+But they also use watermark.silverchair.com
+
+NOTE: re-crawl (force-retry?) all non-recent papers with fatcat-ingest
+NOTE: for watermark.silverchair.com terminal bad-status, re-crawl from initial URL (base_url) using heritrix
+
+## kiss.kstudy.com
+
+Previously unable to download (2021-05_daily_improvements.md)
+
+## open.library.ubc.ca
+
+ link_source_id | base_url | terminal_url
+--------------------+------------------------------------+----------------------------------------------------------------------------------
+ 10.14288/1.0400664 | https://doi.org/10.14288/1.0400664 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400664
+ 10.14288/1.0401189 | https://doi.org/10.14288/1.0401189 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401189
+ 10.14288/1.0401487 | https://doi.org/10.14288/1.0401487 | https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+ 10.14288/1.0400994 | https://doi.org/10.14288/1.0400994 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400994
+ 10.14288/1.0401312 | https://doi.org/10.14288/1.0401312 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401312
+
+Historical newspapers, out of scope?
+
+Video content:
+https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+
+Another video: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+
+NOTE: add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+NOTE: handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+
+
+## panor.ru
+
+ link_source_id | base_url | terminal_url
+-------------------------+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 10.33920/med-14-2108-06 | https://doi.org/10.33920/med-14-2108-06 | https://panor.ru/articles/otsenka-dinamiki-pokazateley-morfofunktsionalnykh-kharakteristik-kozhi-upatsientov-s-spr-pod-vliyaniem-kompleksnoy-fototerapii/66351.html
+ 10.33920/nik-02-2105-01 | https://doi.org/10.33920/nik-02-2105-01 | https://panor.ru/articles/innovatsionnost-obrazovatelnykh-tekhnologiy-kak-istoricheski-oposredovannyy-fenomen/65995.html
+ 10.33920/pro-1-2101-10 | https://doi.org/10.33920/pro-1-2101-10 | https://panor.ru/articles/obespechenie-bezopasnosti-na-promyshlennykh-predpriyatiyakh-s-pomoshchyu-sredstv-individualnoy-zashchity/66299.html
+ 10.33920/sel-4-2008-04 | https://doi.org/10.33920/sel-4-2008-04 | https://panor.ru/articles/osobennosti-regulirovaniya-zemelnykh-otnosheniy-na-prigranichnykh-territoriyakh-rossiyskoy-federatsii/66541.html
+ 10.33920/pro-2-2104-03 | https://doi.org/10.33920/pro-2-2104-03 | https://panor.ru/articles/organizatsiya-samorazvivayushchegosya-proizvodstva-v-realnykh-usloviyakh/65054.html
+
+"The full version of the article is available only to subscribers of the journal"
+
+Paywall
+
+## peerj.com
+
+Previously: this is HTML of reviews (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope
+
+## publons.com
+
+Previously: this is HTML (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope (length of works)
+
+## stm.bookpi.org
+
+ link_source_id | base_url | terminal_url
+-----------------------------+---------------------------------------------+----------------------------------------------------
+ 10.9734/bpi/nfmmr/v7/11547d | https://doi.org/10.9734/bpi/nfmmr/v7/11547d | https://stm.bookpi.org/NFMMR-V7/article/view/3231
+ 10.9734/bpi/ecafs/v1/9773d | https://doi.org/10.9734/bpi/ecafs/v1/9773d | https://stm.bookpi.org/ECAFS-V1/article/view/3096
+ 10.9734/bpi/mpebm/v5/3391f | https://doi.org/10.9734/bpi/mpebm/v5/3391f | https://stm.bookpi.org/MPEBM-V5/article/view/3330
+ 10.9734/bpi/castr/v13/3282f | https://doi.org/10.9734/bpi/castr/v13/3282f | https://stm.bookpi.org/CASTR-V13/article/view/2810
+ 10.9734/bpi/hmms/v13 | https://doi.org/10.9734/bpi/hmms/v13 | https://stm.bookpi.org/HMMS-V13/issue/view/274
+
+These are... just abstracts of articles within a book? Weird. Maybe sketchy?
+DOIs are via Crossref.
+
+## www.cabi.org
+
+ link_source_id | base_url | terminal_url
+--------------------------+------------------------------------------+----------------------------------------------------
+ 10.1079/dfb/20133414742 | https://doi.org/10.1079/dfb/20133414742 | https://www.cabi.org/cabreviews/review/20133414742
+ 10.1079/dmpd/20056500471 | https://doi.org/10.1079/dmpd/20056500471 | https://www.cabi.org/cabreviews/review/20056500471
+ 10.1079/dmpp/20056600544 | https://doi.org/10.1079/dmpp/20056600544 | https://www.cabi.org/cabreviews/review/20056600544
+ 10.1079/dmpd/20056500117 | https://doi.org/10.1079/dmpd/20056500117 | https://www.cabi.org/cabreviews/review/20056500117
+ 10.1079/dmpp20056600337 | https://doi.org/10.1079/dmpp20056600337 | https://www.cabi.org/cabreviews/review/20056600337
+
+Reviews, but just abstracts?
+
+## www.cureus.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17547
+base_url | https://doi.org/10.7759/cureus.17547
+terminal_url | https://www.cureus.com/articles/69542-tramadol-induced-jerks
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16867
+base_url | https://doi.org/10.7759/cureus.16867
+terminal_url | https://www.cureus.com/articles/66793-advanced-squamous-cell-carcinoma-of-gall-bladder-masquerading-as-liver-abscess-with-review-of-literature-review-on-advanced-biliary-tract-cancer
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17425
+base_url | https://doi.org/10.7759/cureus.17425
+terminal_url | https://www.cureus.com/articles/67438-attitudes-and-knowledge-of-medical-students-towards-healthcare-for-lesbian-gay-bisexual-and-transgender-seniors-impact-of-a-case-based-discussion-with-facilitators-from-the-community
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17313
+base_url | https://doi.org/10.7759/cureus.17313
+terminal_url | https://www.cureus.com/articles/67258-utilizing-google-trends-to-track-online-interest-in-elective-hand-surgery-during-the-covid-19-pandemic
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16943
+base_url | https://doi.org/10.7759/cureus.16943
+terminal_url | https://www.cureus.com/articles/19364-small-bowel-obstruction-a-rare-presentation-of-the-inferior-pancreaticoduodenal-artery-pseudoaneurysm-bleed
+
+Ugh, stupid "email to get PDF". but ingest seems to work anyways?
+
+NOTE: re-crawl/re-ingest all (eg, fatcat-ingest or similar)
+
+## www.e-manuscripta.ch
+
+ link_source_id | base_url | terminal_url
+------------------------------+----------------------------------------------+-------------------------------------------------------------------
+ 10.7891/e-manuscripta-114031 | https://doi.org/10.7891/e-manuscripta-114031 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114031
+ 10.7891/e-manuscripta-112064 | https://doi.org/10.7891/e-manuscripta-112064 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112064
+ 10.7891/e-manuscripta-112176 | https://doi.org/10.7891/e-manuscripta-112176 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176
+ 10.7891/e-manuscripta-115200 | https://doi.org/10.7891/e-manuscripta-115200 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-115200
+ 10.7891/e-manuscripta-114008 | https://doi.org/10.7891/e-manuscripta-114008 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114008
+
+Historical docs, single pages, but do have full PDF downloads.
+
+NOTE: re-ingest
+
+## www.inderscience.com
+
+Previously: paywall (2021-05_daily_improvements.md)
+
+## www.un-ilibrary.org
+
+ link_source_id | base_url | terminal_url
+----------------------------+--------------------------------------------+-------------------------------------------------------------
+ 10.18356/9789210550307 | https://doi.org/10.18356/9789210550307 | https://www.un-ilibrary.org/content/books/9789210550307
+ 10.18356/9789210586719c011 | https://doi.org/10.18356/9789210586719c011 | https://www.un-ilibrary.org/content/books/9789210586719c011
+ 10.18356/9789210058575c014 | https://doi.org/10.18356/9789210058575c014 | https://www.un-ilibrary.org/content/books/9789210058575c014
+ 10.18356/9789210550307c020 | https://doi.org/10.18356/9789210550307c020 | https://www.un-ilibrary.org/content/books/9789210550307c020
+ 10.18356/9789213631423c005 | https://doi.org/10.18356/9789213631423c005 | https://www.un-ilibrary.org/content/books/9789213631423c005
+
+Books and chapters. Doesn't seem to have actual download ability?
+
+# Re-Ingest / Re-Crawl
+
+Using fatcat-ingest helper tool.
+
+- www.isca-speech.org doi_prefix:10.21437
+ doi:* doi_prefix:10.21437 in_ia:false
+ 9,233
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.21437' > /srv/fatcat/tasks/2021-09-03_ingest_isca.json
+ => Counter({'ingest_request': 9221, 'elasticsearch_release': 9221, 'estimate': 9221})
+- repository.dri.ie doi_prefix:10.7486
+ doi:* in_ia:false doi_prefix:10.7486
+ 56,532
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.7486' > /srv/fatcat/tasks/2021-09-03_ingest_dri.json
+ => Counter({'ingest_request': 56532, 'elasticsearch_release': 56532, 'estimate': 56532})
+- *.arvojournals.org doi_prefix:10.1167 (force recrawl if no-pdf-link)
+ 25,598
+ many are meeting abstracts
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.1167 > /srv/fatcat/tasks/2021-09-03_ingest_arvo.json
+ => Counter({'ingest_request': 25598, 'elasticsearch_release': 25598, 'estimate': 25598})
+- www.cureus.com doi_prefix:10.7759
+ 1,537
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.7759 > /srv/fatcat/tasks/2021-09-03_ingest_cureus.json
+ => Counter({'ingest_request': 1535, 'elasticsearch_release': 1535, 'estimate': 1535})
+- www.e-manuscripta.ch doi_prefix:10.7891 10.7891/e-manuscripta
+ 110,945
+ TODO: all are marked 'unpublished', but that is actually probably right?
+- www.frontiersin.org doi_prefix:10.3389 (both PDF and XML!)
+ doi:* in_ia:false doi_prefix:10.3389
+ 212,370
+ doi:10.3389/conf.* => most seem to be just abstracts? how many like this?
+ container_id:kecnf6vtpngn7j2avgfpdyw5ym => "topics" (2.2k)
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 191k
+ but many might be components? this is actually kind of a mess
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 19.2k
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' | rg -v 10.3389/conf > /srv/fatcat/tasks/2021-09-03_frontiers.json
+
+# Remaining Tasks / Domains (TODO)
+
+more complex crawling/content:
+- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url
+- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data)
+- doi.ala.org.au: possible dataset ingest source
+- peerj.com, at least reviews, should be HTML ingest? or are some PDF?
+- publons.com should be HTML ingest, possibly special case for scope
+- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug
+
+other tasks:
+- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+- push/deploy sandcrawler changes
diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md
new file mode 100644
index 0000000..d36f427
--- /dev/null
+++ b/notes/ingest/2021-09-03_patch_crawl.md
@@ -0,0 +1,678 @@
+
+Going to run a combined crawl for `no-capture`, `no-pdf-link` and similar URL
+statuses.
+
+As a reminder, significant refactor of PDF URL extraction happened around
+Oct/Nov 2020, so things not re-ingested since then should be retried.
+
+1. first, bulk re-process `no-pdf-link` statuses from past OAI-PMH and OA DOI crawls
+2. then heritrix crawl of old URLs from all sources (see status codes below)
+3. bulk ingest specific sources and statuses (see below)
+
+Status codes to crawl, potentially split into separate batches:
+
+ no-capture
+ IA errors
+ cdx-error
+ wayback-error
+ wayback-content-error
+ petabox-error
+ spn2-cdx-lookup-failure
+ gateway-timeout
+
+Then, bulk ingest from these sources matching the above patterns, in this order:
+
+- OA DOI (fatcat-ingest or fatcat-changelog source; will result in import)
+- unpaywall (will result in import)
+- OAI-PMH
+- MAG
+
+Current combined domain skip list (SQL filter syntax); these are domains we
+don't want to bother retrying:
+
+ '%journals.sagepub.com%'
+ '%pubs.acs.org%'
+ '%ahajournals.org%'
+ '%www.journal.csj.jp%'
+ '%aip.scitation.org%'
+ '%academic.oup.com%'
+ '%tandfonline.com%'
+ '%://orcid.org/%'
+ '%://doaj.org/%'
+ '%://archive.org/%'
+ '%://web.archive.org/%'
+ '%://www.archive.org/%'
+
+## DOI Ingest Status (2021-09-08)
+
+Recently did some analysis of OAI-PMH overall status, so can re-do comparisons
+there easily. What about overall DOI ingest? Would like counts so we can
+compare before/after.
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------+----------
+ no-pdf-link | 10516478
+ success | 5690862
+ redirect-loop | 1827192
+ no-capture | 1215179
+ terminal-bad-status | 650104
+ link-loop | 610251
+ blocked-cookie | 353681
+ gateway-timeout | 341319
+ too-many-redirects | 307895
+ forbidden | 306710
+ spn2-cdx-lookup-failure | 282955
+ not-found | 273667
+ cdx-error | 269082
+ skip-url-blocklist | 265689
+ spn2-error | 87759
+ wrong-mimetype | 68993
+ spn2-error:too-many-redirects | 58064
+ wayback-error | 54152
+ spn2-wayback-error | 51752
+ remote-server-error | 45683
+ (20 rows)
+
+## `no-pdf-link` re-try bulk ingest
+
+Specifically for past OAI-PMH and OA DOI crawls.
+
+What are the top terminal domains that would be retried? Knowing these lets us
+filter out large domains that we don't want to bother retrying.
+
+ SELECT domain, COUNT(domain)
+ FROM (
+ SELECT
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 40;
+
+ domain | count
+ ---------------------------------------+--------
+ ssl.fao.org | 862277
+ www.e-periodica.ch | 828110
+ zenodo.org | 686701
+ plutof.ut.ee | 685440
+ www.gbif.org | 669727
+ dlc.library.columbia.edu | 536018
+ figshare.com | 383181
+ juser.fz-juelich.de | 351519
+ statisticaldatasets.data-planet.com | 320415
+ espace.library.uq.edu.au | 310767
+ invenio.nusl.cz | 309731
+ doi.pangaea.de | 306311
+ igi.indrastra.com | 297872
+ bib-pubdb1.desy.de | 273565
+ t2r2.star.titech.ac.jp | 271907
+ digi.ub.uni-heidelberg.de | 265519
+ www.sciencedirect.com | 263847
+ publikationen.bibliothek.kit.edu | 229960
+ www.plate-archive.org | 209231
+ www.degruyter.com | 189776
+ spectradspace.lib.imperial.ac.uk:8443 | 187086
+ hal.archives-ouvertes.fr | 185513
+ open.library.ubc.ca | 172821
+ lup.lub.lu.se | 170063
+ books.openedition.org | 169501
+ orbi.uliege.be | 161443
+ freidok.uni-freiburg.de | 150310
+ library.wur.nl | 124318
+ digital.library.pitt.edu | 116406
+ www.research.manchester.ac.uk | 115869
+ www.bibliotecavirtualdeandalucia.es | 114527
+ repository.tue.nl | 112157
+ www.google.com | 111569
+ easy.dans.knaw.nl | 109608
+ springernature.figshare.com | 108597
+ nbn-resolving.org | 107544
+ scholarbank.nus.edu.sg | 107299
+ bibliotecavirtualdefensa.es | 105501
+ biblio.ugent.be | 100854
+ ruj.uj.edu.pl | 99500
+ (40 rows)
+
+For a number of these domains, we do not expect any PDFs to be found, but we are
+going to re-ingest anyways so they get marked as 'blocked-*' in the result table:
+
+- ssl.fao.org
+- plutof.ut.ee
+- www.gbif.org
+
+But some we are just going to skip anyways, because there *could* be PDFs, but
+probably *aren't*:
+
+- zenodo.org
+- t2r2.star.titech.ac.jp
+- www.google.com
+- figshare.com
+- springernature.figshare.com
+
+Dump ingest requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json';
+ => COPY 18040676
+
+Transform and start ingest:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json
+ => 18.0M 0:06:45 [44.5k/s]
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ => DONE
+
+## Progress Check
+
+OAI-PMH query:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 13258356
+ no-pdf-link | 8685519
+ no-capture | 4765663
+ redirect-loop | 1557731
+ terminal-bad-status | 803373
+ link-loop | 453999
+ wrong-mimetype | 440230
+ null-body | 71457
+ cdx-error | 18426
+ | 15275
+ petabox-error | 13408
+ wayback-error | 11845
+ blocked-cookie | 11580
+ skip-url-blocklist | 7761
+ wayback-content-error | 383
+ spn2-cdx-lookup-failure | 362
+ gateway-timeout | 320
+ body-too-large | 207
+ spn2-error:job-failed | 191
+ redirects-exceeded | 120
+ (20 rows)
+
+OAI-PMH compared to a couple weeks ago:
+
+ 13258356-12872279 = +386,077 success
+ 8685519-9329602 = -644,083 no-pdf-link
+ 4765663-4696362 = +69,301 no-capture
+ 803373-660418 = +142,955 terminal-bad-status
+
+OA DOI ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------+---------
+ no-pdf-link | 6693547
+ success | 5979016
+ skip-url-blocklist | 3080986
+ no-capture | 1876914
+ redirect-loop | 1872817
+ terminal-bad-status | 656674
+ link-loop | 624290
+ blocked-cookie | 448001
+ gateway-timeout | 351896
+ too-many-redirects | 307895
+ forbidden | 306710
+ spn2-cdx-lookup-failure | 301312
+ cdx-error | 279766
+ not-found | 273667
+ wrong-mimetype | 83289
+ spn2-error | 76806
+ spn2-error:too-many-redirects | 58064
+ wayback-error | 54278
+ spn2-wayback-error | 51768
+ remote-server-error | 45683
+ (20 rows)
+
+OA DOI changes:
+
+ 5979016-5690862 = +288,154 success
+ 6693547-10516478 = -3,822,931 no-pdf-link (still many!)
+ 1876914-1215179 = +661,735 no-capture
+ 3080986-265689 = +2,815,297 skip-url-blocklist
+
+Overall roughly 675k new 'success' results across both sources, pretty good.
+Over 700k new 'no-capture' for crawling.
+
+## Seedlist Dumps
+
+Note that these are just seedlists (terminal URLs), not full ingest requests.
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ )
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ ) TO '/srv/sandcrawler/tasks/patch_2021-09-16_terminal_seedlist.txt';
+ => 6,354,365
+
+Then run the actual patch crawl!
+
+## Ingest Requests for Bulk Retry (2022-01-06)
+
+The crawl has just about completed, so running another round of bulk ingest
+requests, slightly updated to allow `https://doi.org/10.*` terminal URLs:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.updated <= '2022-01-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json';
+ => 4,488,193
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json
+ => DONE
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => TIMEDOUT
+ => (probably due to re-assignment)
+ => DONE
+
+## Stats Again (just OAI-PMH)
+
+OAI-PMH query:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+On 2022-02-08:
+
+ status | count
+ -----------------------+----------
+ success | 13505143
+ no-pdf-link | 8741007
+ no-capture | 4429986
+ redirect-loop | 1566611
+ terminal-bad-status | 816162
+ link-loop | 459006
+ wrong-mimetype | 448983
+ null-body | 71871
+ cdx-error | 19055
+ | 15275
+ petabox-error | 11713
+ blocked-cookie | 11664
+ wayback-error | 8745
+ skip-url-blocklist | 7828
+ max-hops-exceeded | 2031
+ wayback-content-error | 338
+ body-too-large | 280
+ spn2-error:job-failed | 191
+ bad-redirect | 134
+ redirects-exceeded | 120
+ (20 rows)
+
+
+On 2022-02-28, after bulk ingest completed:
+
+ status | count
+ -----------------------+----------
+ success | 14668123
+ no-pdf-link | 8822460
+ no-capture | 2987565
+ redirect-loop | 1629015
+ terminal-bad-status | 917851
+ wrong-mimetype | 466512
+ link-loop | 460941
+ null-body | 71457
+ cdx-error | 19636
+ petabox-error | 16198
+ | 15275
+ blocked-cookie | 11885
+ wayback-error | 8779
+ skip-url-blocklist | 7838
+ empty-blob | 5906
+ max-hops-exceeded | 5563
+ wayback-content-error | 355
+ body-too-large | 329
+ spn2-error:job-failed | 191
+ bad-redirect | 137
+ (20 rows)
+
+
+Comparing to a couple months ago:
+
+ 14668123-13258356 = +1,409,767 success
+ 8822460-8685519 = + 136,941 no-pdf-link
+ 2987565-4765663 = -1,778,098 no-capture
+ 917851-803373 = + 114,478 terminal-bad-status
+
diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md
new file mode 100644
index 0000000..786c3b2
--- /dev/null
+++ b/notes/ingest/2021-12-13_datasets.md
@@ -0,0 +1,504 @@
+
+First round of production dataset ingest. Aiming to get one or two small
+repositories entirely covered, and a few thousand datasets from all supported
+platforms.
+
+Planning to run with sandcrawler in batch mode on `wbgrp-svc263`, expecting up
+to a TByte of content locally (on spinning disk). For successful output, will
+run through fatcat import; for a subset of unsuccessful, will start a small
+heritrix crawl.
+
+
+## Ingest Generation
+
+Summary:
+
+ wc -l /srv/fatcat/tasks/ingest_dataset_*pilot.json
+ 2 /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+ 1702 /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ 2975 /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
+All the below ingest requests were combined into a single large file:
+
+ cat /srv/fatcat/tasks/ingest_dataset*pilot.json | shuf | pv -l | gzip > /srv/fatcat/tasks/ingest_dataset_combined.json.gz
+ # 24.7k 0:00:00 [91.9k/s]
+
+### Figshare
+
+- sample 10k datasets (not other types)
+- want only "versioned" DOIs; use regex on DOI to ensure
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.6084 type:dataset' \
+ | rg '10\.6084/m9\.figshare\.\d+.v\d+' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ # Counter({'estimate': 505968, 'ingest_request': 50000, 'elasticsearch_release': 50000})
+
+### Zenodo
+
+- has DOIs (of course)
+- want only "versioned" DOIs? how to skip?
+- sample 10k
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.5281 type:dataset' \
+ | rg '10\.5281/zenodo' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
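+One possible way to skip non-versioned "concept" DOIs, sketched under the
+assumption that the Zenodo records API exposes a `conceptdoi` field (field name
+and behavior are not verified here; the DOI string alone does not distinguish
+concept from version DOIs):
+
+    import requests
+
+    def is_concept_doi(doi):
+        """True if this Zenodo DOI looks like a 'concept' DOI covering all versions."""
+        recid = doi.split("zenodo.")[-1]
+        record = requests.get(f"https://zenodo.org/api/records/{recid}").json()
+        # a concept record's own DOI equals its conceptdoi; version DOIs differ
+        return record.get("conceptdoi") == doi
+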
+### Goettingen Research Online
+
+- <https://data.goettingen-research-online.de/>
+- Dataverse instance, not harvard-hosted
+- ~1,400 datasets, ~10,500 files
+- has DOIs
+- `doi_prefix:10.25625`, then filter to only one slash
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query 'doi_prefix:10.25625 type:dataset' \
+ | rg -v '10\.25625/[a-z0-9]+/[a-z0-9]' \
+ | shuf \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ # Counter({'ingest_request': 12739, 'elasticsearch_release': 12739, 'estimate': 12739}) # 1.7k 0:01:29 [ 19 /s]
+
+### Harvard Dataverse
+
+- main harvard dataverse instance, many "sub-dataverses"
+- ~137,000 datasets, ~1,400,000 files
+- 10k sample
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.7910 type:dataset' \
+ | rg '10\.7910/dvn/[a-z0-9]{6}' \
+ | rg -v '10\.7910/dvn/[a-z0-9]{6}/[a-z0-9]' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ # Counter({'estimate': 660979, 'ingest_request': 50000, 'elasticsearch_release': 50000}) # 2.97k 0:03:26 [14.4 /s]
+
+Note that this was fewer than expected, but moving on anyway.
+
+### archive.org
+
+A couple hand-filtered items.
+
+"CAT" dataset
+- item: <https://archive.org/details/CAT_DATASET>
+- fatcat release (for paper): `release_36vy7s5gtba67fmyxlmijpsaui`
+
+"The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing"
+- https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62
+- https://fatcat.wiki/release/7owybd2hrvdmdpm4zpo7hkn2pu (paper)
+
+
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/CAT_DATASET",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "36vy7s5gtba67fmyxlmijpsaui",
+ "work_ident": "ycqtbhnfmzamheq2amztiwbsri"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "36vy7s5gtba67fmyxlmijpsaui"
+ }
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "7owybd2hrvdmdpm4zpo7hkn2pu",
+ "work_ident": "3xkz7iffwbdfhbwhnd73iu66cu"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "7owybd2hrvdmdpm4zpo7hkn2pu"
+ }
+
+ # paste and then Ctrl-D:
+ cat | jq . -c > /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+
+
+## Ingest Command
+
+On `wbgrp-svc263`.
+
+In the current version of the tool, `skip_cleanup_local_files=True` is the
+default, so downloaded files will stick around.
+
+Note that `--no-spn2` is passed, so we are expecting a lot of `no-capture` in the output.
+
+
+ # first a small sample
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | head -n5 \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.ramp.json
+
+ # ok, run the whole batch through
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.json
+
+Got an error:
+
+ internetarchive.exceptions.AuthenticationError: No access_key or secret_key set! Have you run `ia configure`?
+
+Did a hot patch to try to have the uploads happen under a session, with config from the environment, but it didn't work:
+
+ AttributeError: 'ArchiveSession' object has no attribute 'upload'
+
+Going to hack with config in homedir for now.
+
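+For reference, a rough sketch of what the session-based approach could look
+like with the `internetarchive` library (uploads hang off an item object rather
+than the session itself). The identifier below appears in the error output
+further down; the environment variable names and metadata are hypothetical, not
+what the tool actually does:
+
+    import os
+    import internetarchive
+
+    # build a session with S3-style keys from the environment,
+    # instead of relying on ~/.config/ia.ini in the homedir
+    session = internetarchive.get_session(config={
+        "s3": {
+            "access": os.environ["IA_ACCESS_KEY"],   # hypothetical env var names
+            "secret": os.environ["IA_SECRET_KEY"],
+        },
+    })
+
+    # uploads are a method on Item, not on ArchiveSession
+    item = session.get_item("zenodo.org-3275525")
+    item.upload(
+        files=["/tmp/sandcrawler/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5"],
+        metadata={"mediatype": "data"},
+        verbose=True,
+    )
+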
+Extract URLs for crawling:
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg -v '"manifest"' \
+        | jq 'select(.status == "no-capture")' -c \
+ | jq .request.base_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.base_url.txt
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg '"manifest"' \
+        | jq 'select(.status == "no-capture")' -c \
+ | rg '"web-' \
+ | jq .manifest[].terminal_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.manifest_terminal.txt
+
+### Exceptions Encountered
+
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 193, in process
+ internetarchive.upload
+ [...]
+ ConnectionResetError: [Errno 104] Connection reset by peer
+ urllib3.exceptions.ProtocolError
+ requests.exceptions.ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), 'https://s3.us.archive.org/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5')
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 375, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 130, in process
+ r.raise_for_status()
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status
+ raise HTTPError(http_error_msg, response=self)
+ requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://ndownloader.figshare.com/files/5474201
+
+Downloads sometimes just slowly time out, sometimes only after a day or more.
+
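+Not investigated, but multi-day stalls suggest the per-file downloads may not
+set a read timeout. A rough sketch of a streaming download with explicit
+connect/read timeouts using plain `requests` (URL is from the traceback above;
+timeout and chunk-size values are arbitrary, and this is not what the tool
+currently does):
+
+    import requests
+
+    url = "https://ndownloader.figshare.com/files/5474201"
+    # (connect_timeout, read_timeout) in seconds; a fully stalled server then
+    # raises requests.exceptions.Timeout instead of hanging for a day
+    with requests.get(url, stream=True, timeout=(30, 120)) as resp:
+        resp.raise_for_status()
+        with open("/tmp/download.bin", "wb") as out:
+            for chunk in resp.iter_content(chunk_size=1024 * 1024):
+                out.write(chunk)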
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 381, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 155, in process
+ file_meta = gen_file_metadata_path(local_path, allow_empty=True)
+ File "/srv/sandcrawler/src/python/sandcrawler/misc.py", line 89, in gen_file_metadata_path
+ mimetype = magic.Magic(mime=True).from_file(path)
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/magic/__init__.py", line 111, in from_file
+ with _real_open(filename):
+ FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz'
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 314, in process
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_platforms.py", line 208, in process_request
+ obj_latest = obj["data"]["latestVersion"]
+ KeyError: 'latestVersion'
+
+Fixed the above, trying again:
+
+ git log | head -n1
+ # commit ffdc901fa067db55fe6cfeb8d0c3807d29df092c
+
+ Wed Dec 15 21:57:42 UTC 2021
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results4.json
+
+Zenodo seems really slow, let's try filtering those out:
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results5.json
+ # 3.76k 15:12:53 [68.7m/s]
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results6.json
+
+## Fatcat Import
+
+ wc -l ingest_dataset_combined_results*.json
+ 126 ingest_dataset_combined_results2.json
+ 153 ingest_dataset_combined_results3.json
+ 275 ingest_dataset_combined_results4.json
+ 3762 ingest_dataset_combined_results5.json
+ 7736 ingest_dataset_combined_results6.json
+ 182 ingest_dataset_combined_results.json
+ 5 ingest_dataset_combined_results.ramp.json
+ 12239 total
+
+ cat ingest_dataset_combined_results*.json \
+ | rg '^\{' \
+ | jq '[.request.fatcat.release_ident, . | tostring] | @tsv' -r \
+ | sort \
+ | uniq --check-chars 26 \
+ | cut -f2 \
+ | rg -v '\\\\' \
+ | pv -l \
+ > uniq_ingest_dataset_combined_results.json
+ # 9.48k 0:00:06 [1.54k/s]
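+    # note: fatcat release idents are 26 characters, so --check-chars 26 dedupes on the ident column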
+
+ cat uniq_ingest_dataset_combined_results.json | jq .status -r | sort | uniq -c | sort -nr
+ 7941 no-capture
+ 374 platform-404
+ 369 terminal-bad-status
+ 348 success-file
+ 172 success
+ 79 platform-scope
+ 77 error-platform-download
+ 47 empty-manifest
+ 27 platform-restricted
+ 20 too-many-files
+ 12 redirect-loop
+ 6 error-archiveorg-upload
+ 3 too-large-size
+ 3 mismatch
+ 1 no-platform-match
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success") | .' -c \
+ > uniq_ingest_dataset_combined_results.success.json
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success-file") | .' -c \
+ > uniq_ingest_dataset_combined_results.success-file.json
+
+On fatcat QA instance:
+
+ git log | head -n1
+ # commit cca680e2cc4768a4d45e199f6256a433b25b4075
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-ingest-type': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+Need to update fatcat file worker to support single-file filesets... was that the plan?
+
+ head /tmp/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-no-access-url': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0})
+
+Trying again 2022-03-23:
+
+ git log | head -n1
+ # commit 134cb050988be2c545af89e0a67c4998307bb819
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0})
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0})
+
+Fixed a small logic error in the insert path.
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0})
+
+archive.org datasets are *not* getting uploaded with the correct paths:
+directory prefixes are getting clobbered.
+
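+If it helps when fixing this, the `internetarchive` upload call accepts a dict
+mapping remote keys to local paths, which preserves directory prefixes instead
+of flattening to basenames. A rough sketch (item identifier and file paths are
+made up for illustration):
+
+    import internetarchive
+
+    local_base = "/tmp/sandcrawler/CAT_DATASET"                 # hypothetical local download dir
+    rel_paths = ["data/part-00000.csv", "data/part-00001.csv"]  # hypothetical relative paths
+
+    # {remote_name: local_path} keeps the "data/" prefix in the uploaded item
+    files = {p: f"{local_base}/{p}" for p in rel_paths}
+
+    item = internetarchive.get_item("CAT_DATASET-mirror")       # hypothetical identifier
+    item.upload(files=files, verbose=True)
+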
+## Summary
+
+As a follow-up, it may be worth doing another manual round of ingest requests.
+After that, it would be good to fill in "glue" code so that this can be done
+with kafka workers, and to do re-tries/dumps using the sandcrawler SQL
+database. Then we can start scaling up ingest, using the ingest tool, "bulk
+mode" processing, heritrix crawls from `no-capture` dumps, etc, similar to the
+bulk file ingest process.
+
+For scaling, let's do a "full" ingest request generation of all datasets, and
+crawl the base URL with heritrix, in fast/direct mode. Expect this to be tens
+of millions of mostly DOIs (doi.org URLs), should crawl quickly.
+
+Then, do bulk downloading with the ingest worker, perhaps on misc-vm or aitio,
+uploading large datasets to archive.org but not doing SPN web requests. Feed
+the resulting huge file seedlist into a heritrix crawl to download the web
+files.
+
+Will need to add support for more specific platforms.
+
+
+### Huge Bulk Ingest Prep
+
+On prod instance:
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query type:dataset \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_dataset_bulk.2022-01-05.json.gz
+ # Expecting 11264787 release objects in search queries
+ # TIMEOUT ERROR
+ # 6.07M 19:13:02 [87.7 /s] (partial)
+
+As a follow-up, should do a full batch (not partial). For now the search index
+is too unreliable (read timeouts).
+
+ zcat ingest_dataset_bulk.2022-01-05.partial.json.gz \
+ | jq .base_url -r \
+ | sort -u \
+ | shuf \
+ | awk '{print "F+ " $1}' \
+ > ingest_dataset_bulk.2022-01-05.partial.schedule
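+    # ("F+ <url>" is the heritrix action-directory directive to force-schedule a URL)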
+
+## Retries (2022-01-12)
+
+This is after having done a bunch of crawling.
+
+ cat ingest_dataset_combined_results6.json \
+ | rg '"no-capture"' \
+        | jq 'select(.status == "no-capture")' -c \
+ | jq .request -c \
+ | pv -l \
+ > ingest_dataset_retry.json
+ => 6.51k 0:00:01 [3.55k/s]
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_retry.json \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json
+
+## Retries (2022-02)
+
+Finally got things to complete end to end for this batch!
+
+ cat ingest_dataset_retry_results5.json | jq .status -r | sort | uniq -c | sort -nr
+ 3220 terminal-bad-status
+ 2120 no-capture
+ 380 empty-manifest
+ 264 success-file
+ 251 success
+ 126 success-existing
+ 39 mismatch
+ 28 error-platform-download
+ 24 too-many-files
+ 20 platform-scope
+ 13 platform-restricted
+ 13 mismatch-size
+ 6 too-large-size
+ 3 transfer-encoding-error
+ 2 no-platform-match
+ 2 error-archiveorg-upload
+ 1 redirect-loop
+ 1 empty-blob
+
+Some more URLs to crawl:
+
+ cat ingest_dataset_retry_results5.json \
+ | rg '"no-capture"' \
+ | rg -v '"manifest"' \
+        | jq 'select(.status == "no-capture")' -c \
+ | jq .request.base_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.base_url.txt
+ # 1.00
+ # just a single DOI that failed to crawl, for whatever reason
+
+ cat ingest_dataset_retry_results5.json \
+ | rg '"no-capture"' \
+ | rg '"manifest"' \
+        | jq 'select(.status == "no-capture")' -c \
+ | rg '"web-' \
+ | jq .manifest[].terminal_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt
+
+These are ready to crawl, in the existing dataset crawl.
+
+ cat /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt \
+ | sort -u \
+ | shuf \
+ | awk '{print "F+ " $1}' \
+ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule
+
+## Running Uploads Again
+
+Looks like the temporary download files got wiped on `wbgrp-svc263`. This is a
+big bummer! Will need to download many of these over again.
+
+ # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316
+ # skip_cleanup_local_files=True is still default
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json
+
+ # filter out zenodo, very slow:
+ # rg -v 10.5281 \
diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md
new file mode 100644
index 0000000..941519f
--- /dev/null
+++ b/notes/ingest/2022-01-06_patch_crawl.md
@@ -0,0 +1,398 @@
+
+Starting another paper fulltext patch crawl, targeting recent OA content which
+has failed to ingest, and platforms (arxiv, etc).
+
+Specifically:
+
+- "daily" changelog ingest requests from all time, which failed with various status codes
+- pdf no-capture
+- SPN errors
+- terminal-bad-status with 5xx, 429
+- gateway-timeout
+- html no-capture
+- html-resource-no-capture
+
+Most of these are dumped in a single complex query (below).
+
+TODO: html-resource-no-capture (from error message? or do SPN requests separately?)
+
+
+## Initial 'no-capture' Seedlist
+
+Dump terminal URLs (will do ingest requests later, using similar command):
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ -- SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-12.rows.json';
+ ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-01-12.txt';
+ => COPY 6389683
+
+TODO: filter out archive.org/www.archive.org
+
+ cat patch_terminal_url.2022-01-12.txt \
+ | rg -v www.archive.org \
+ | rg '://' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | rg -i '^http' \
+ | sort -u -S 4G \
+ | pv -l \
+ > patch_terminal_url.2022-01-12.uniq.txt
+ => 5.73M 0:00:47 [ 120k/s]
+
+    # note: tweaked and re-ran the above after inspecting this output
+ cut -f3 -d/ patch_terminal_url.2022-01-12.uniq.txt | sort | uniq -c | sort -nr | head -n25
+ 799045 doi.org
+ 317557 linkinghub.elsevier.com
+ 211091 arxiv.org
+ 204334 iopscience.iop.org
+ 139758 dialnet.unirioja.es
+ 130331 www.scielo.br
+ 124626 www.persee.fr
+ 85764 digitalrepository.unm.edu
+ 83913 www.mdpi.com
+ 79662 www.degruyter.com
+ 75703 www.e-periodica.ch
+ 72206 dx.doi.org
+ 69068 escholarship.org
+ 67848 idus.us.es
+ 57907 zenodo.org
+ 56624 ir.opt.ac.cn
+ 54983 projecteuclid.org
+ 52226 rep.bntu.by
+ 48376 osf.io
+ 48009 pubs.rsc.org
+ 46947 publikationen.ub.uni-frankfurt.de
+ 45564 www.research-collection.ethz.ch
+ 45153 dk.um.si
+ 43313 www.ssoar.info
+ 40543 scholarworks.umt.edu
+
+TODO: cleanup ingest request table in sandcrawler-db:
+- remove filtered OAI-PMH prefixes
+- remove any invalid `base_url` (?)
+
+## More Seedlist (2022-02-08)
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ -- SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-08.rows.json';
+ ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-02-08.txt';
+ => COPY 444764
+
+ cat patch_terminal_url.2022-02-08.txt \
+ | rg -v www.archive.org \
+ | rg '://' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | rg -i '^http' \
+ | sort -u -S 4G \
+ | pv -l \
+ > patch_terminal_url.2022-02-08.uniq.txt
+ => 426k 0:00:04 [ 103k/s]
+
+ cut -f3 -d/ patch_terminal_url.2022-02-08.uniq.txt | sort | uniq -c | sort -nr | head -n25
+ 60123 www.degruyter.com
+ 59314 arxiv.org
+ 43674 zenodo.org
+ 17771 doi.org
+ 9501 linkinghub.elsevier.com
+ 9379 www.mdpi.com
+ 5691 opendata.uni-halle.de
+ 5578 scholarlypublishingcollective.org
+ 5451 era.library.ualberta.ca
+ 4982 www.cairn.info
+ 4306 www.taylorfrancis.com
+ 4189 papers.ssrn.com
+ 4157 apps.crossref.org
+ 4089 www.sciencedirect.com
+ 4033 mdpi-res.com
+ 3763 dlc.mpg.de
+ 3408 osf.io
+ 2603 www.frontiersin.org
+ 2594 watermark.silverchair.com
+ 2569 journals.lww.com
+ 1787 underline.io
+ 1680 archiviostorico.fondazione1563.it
+ 1658 www.jstage.jst.go.jp
+ 1611 cyberleninka.ru
+ 1535 www.schoeningh.de
+
+ cat patch_terminal_url.2022-02-08.txt | awk '{print "F+ " $1}' > patch_terminal_url.2022-02-08.schedule
+ => Done
+
+Copied to crawler svc206 and added to frontier.
+
+
+## Bulk Ingest Requests (2022-02-28)
+
+Note that we are skipping OAI-PMH here, because we just did a separate ingest
+for those.
+
+This is going to dump many duplicate lines (same `base_url`, multiple
+requests), but that is fine. Expecting something like 7 million rows.
+
+ COPY (
+ -- SELECT ingest_file_result.terminal_url
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND ingest_file_result.updated <= '2022-02-08'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ -- ingest_request.link_source = 'oai'
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json';
+ # COPY 3053219
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json
+ => DONE
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md
new file mode 100644
index 0000000..a6f08dd
--- /dev/null
+++ b/notes/ingest/2022-01-13_doi_crawl.md
@@ -0,0 +1,248 @@
+
+Could roll this into the current patch crawl instead of starting a new crawl from scratch.
+
+This file is misnamed; these are mostly non-DOI-specific small updates.
+
+## KBART "almost complete" experimentation
+
+Random 10 releases:
+
+ cat missing_releases.json | shuf -n10 | jq .ident -r | awk '{print "https://fatcat.wiki/release/" $1}'
+ https://fatcat.wiki/release/suggmo4fnfaave64frttaqqoja - domain gone
+ https://fatcat.wiki/release/uw2dq2p3mzgolk4alze2smv7bi - DOAJ, then OJS PDF link. sandcrawler failed, fixed
+ https://fatcat.wiki/release/fjamhzxxdndq5dcariobxvxu3u - OJS; sandcrawler fix works
+ https://fatcat.wiki/release/z3ubnko5ifcnbhhlegc24kya2u - OJS; sandcrawler failed, fixed (separate pattern)
+ https://fatcat.wiki/release/pysc3w2cdbehvffbyca4aqex3i - DOAJ, OJS bilingual, failed with 'redirect-loop'. force re-crawl worked for one copy
+ https://fatcat.wiki/release/am2m5agvjrbvnkstke3o3xtney - not attempted previously (?), success
+ https://fatcat.wiki/release/4zer6m56zvh6fd3ukpypdu7ita - cover page of journal (not an article). via crossref
+ https://fatcat.wiki/release/6njc4rdaifbg5jye3bbfdhkbsu - OJS; success
+ https://fatcat.wiki/release/jnmip3z7xjfsdfeex4piveshvu - OJS; not crawled previously; success
+ https://fatcat.wiki/release/wjxxcknnpjgtnpbzhzge6rkndi - no-pdf-link, fixed
+
+Try some more!
+
+ https://fatcat.wiki/release/ywidvbhtfbettmfj7giu2htbdm - not attempted, success
+ https://fatcat.wiki/release/ou2kqv5k3rbk7iowfohpitelfa - OJS, not attempted, success?
+ https://fatcat.wiki/release/gv2glplmofeqrlrvfs524v5qa4 - scirp.org; 'redirect-loop'; HTML/PDF/XML all available; then 'gateway-timeout' on retry
+ https://fatcat.wiki/release/5r5wruxyyrf6jneorux3negwpe - gavinpublishers.com; broken site
+ https://fatcat.wiki/release/qk4atst6svg4hb73jdwacjcacu - horyzonty.ignatianum.edu.pl; broken DOI
+ https://fatcat.wiki/release/mp5ec3ycrjauxeve4n4weq7kqm - old cert; OJS; success
+ https://fatcat.wiki/release/sqnovcsmizckjdlwg3hipxrfqm - not attempted, success
+ https://fatcat.wiki/release/42ruewjuvbblxgnek6fpj5lp5m - OJS URL, but domain broken
+ https://fatcat.wiki/release/crg6aiypx5enveldvmwy5judp4 - volume/cover (stub)
+ https://fatcat.wiki/release/jzih3vvxj5ctxk3tbzyn5kokha - success
+
+
+## Seeds: fixed OJS URLs
+
+Made some recent changes to sandcrawler, should re-attempt OJS URLs, particularly from DOI or DOAJ, with pattern like:
+
+- `no-pdf-link` with terminal URL like `/article/view/`
+- `redirect-loop` with terminal URL like `/article/view/`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json';
+ => COPY 326577
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json > /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json
+ cat /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Done/running.
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'link-loop'
+ )
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt';
+ => COPY 342415
+
+ cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.schedule
+
+Done/seeded.
+
+## Seeds: scitemed.com
+
+Batch retry sandcrawler `no-pdf-link` with terminal URL like: `scitemed.com/article`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_file_result.terminal_url LIKE '%/article/view/%'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_scitemed.2022-01-13.rows.json';
+ # SKIPPED
+
+Actually there are very few of these.
+
+## Seeds: non-OA paper DOIs
+
+There are many DOIs out there which are likely to be from small publishers, on
+the web, and would ingest just fine (eg, in OJS).
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' --count
+ 30,938,106
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'preservation:none' --count
+ 6,664,347
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'in_kbart:false' --count
+ 8,258,111
+
+Do the 8 million first, then maybe try the 30.9 million later? Do sampling to
+see how many are actually accessible? From experience with KBART generation,
+many of these are likely to crawl successfully.
+
+ ./fatcat_ingest.py --ingest-type pdf --allow-non-oa query 'in_ia:false is_oa:false doi:* release_type:article-journal container_id:* !publisher_type:big5 in_kbart:false' \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz
+ # re-running 2022-02-08 after this VM was upgraded
+ # Expecting 8321448 release objects in search queries
+ # DONE
+
+This is large enough that it will probably be a bulk ingest, and then probably
+a follow-up crawl.
+
+## Seeds: HTML and XML links from HTML biblio
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -e \
+ | pv -l \
+ | rg '"(html|xml)_fulltext_url"' \
+ | rg '"no-pdf-link"' \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.json.gz
+
+    # cut this off at some point? the gzip stream ends up truncated (see below)
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz | wc -l
+ # gzip: ingest_file_result_fulltext_urls.2022-01-13.json.gz: unexpected end of file
+ # 2,538,433
+
+Prepare seedlists (to include in heritrix patch crawl):
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+ | jq .html_biblio.xml_fulltext_url -r \
+ | rg '://' \
+ | sort -u -S 4G \
+ | pv -l \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz
+ # 1.24M 0:01:35 [12.9k/s]
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \
+ | jq .html_biblio.html_fulltext_url -r \
+ | rg '://' \
+ | sort -u -S 4G \
+ | pv -l \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz
+ # 549k 0:01:27 [6.31k/s]
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+ | cut -f3 -d/ \
+ | sort -S 4G \
+ | uniq -c \
+ | sort -nr \
+ | head -n20
+
+ 534005 dlc.library.columbia.edu
+ 355319 www.degruyter.com
+ 196421 zenodo.org
+ 101450 serval.unil.ch
+ 100631 biblio.ugent.be
+ 47986 digi.ub.uni-heidelberg.de
+ 39187 www.emerald.com
+ 33195 www.cairn.info
+ 25703 boris.unibe.ch
+ 19516 journals.openedition.org
+ 15911 academic.oup.com
+ 11091 repository.dl.itc.u-tokyo.ac.jp
+ 9847 oxfordworldsclassics.com
+ 9698 www.thieme-connect.de
+ 9552 www.idunn.no
+ 9265 www.zora.uzh.ch
+ 8030 www.scielo.br
+ 6543 www.hanspub.org
+ 6229 asmedigitalcollection.asme.org
+ 5651 brill.com
+
+ zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \
+ | awk '{print "F+ " $1}' \
+ > ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+ wc -l ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+ 1785901 ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule
+
+Added to `JOURNALS-PATCH-CRAWL-2022-01`
+
+## Seeds: most doi.org terminal non-success
+
+Unless it is a 404, should retry.
+
+TODO: generate this list
+
+## Non-OA DOI Bulk Ingest
+
+Had previously run:
+
+ cat ingest_nonoa_doi.json.gz \
+ | rg -v "doi.org/10.2139/" \
+ | rg -v "doi.org/10.1021/" \
+ | rg -v "doi.org/10.1121/" \
+ | rg -v "doi.org/10.1515/" \
+ | rg -v "doi.org/10.1093/" \
+ | rg -v "europepmc.org" \
+ | pv -l \
+ | gzip \
+ > nonoa_doi.filtered.ingests.json.gz
+ # 7.35M 0:01:13 [99.8k/s]
+
+Starting a bulk ingest of these on 2022-03-18, which is *before* the crawl has
+entirely finished, but after almost all queues (domains) have been done for
+several days.
+
+ zcat nonoa_doi.filtered.ingests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Looks like many jstage `no-capture` status; these are still (slowly) crawling.
diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md
new file mode 100644
index 0000000..9722459
--- /dev/null
+++ b/notes/ingest/2022-03_doaj.md
@@ -0,0 +1,278 @@
+
+plan:
+- usual setup and dump ingest requests
+- filter ingest requests to targeted ccTLDs, and add those to the crawl first
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz'
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz
+ # 9.08M 0:37:38 [4.02k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373})
+
+
+## Check Pre-Crawl Status
+
+2022-03-09, before the above load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 2919808
+ html | wrong-scope | 1098998
+ pdf | no-pdf-link | 481532
+ pdf | redirect-loop | 429006
+ html | success | 342501
+ html | unknown-scope | 225390
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187762
+ html | no-capture | 185418
+ pdf | no-capture | 171273
+ pdf | null-body | 129028
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91551
+ pdf | link-loop | 25447
+ html | wrong-mimetype | 22640
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ pdf | wrong-mimetype | 7688
+ xml | success | 6897
+ html | petabox-error | 5529
+ pdf | wayback-error | 2706
+ xml | null-body | 2353
+ pdf | | 2063
+ pdf | wayback-content-error | 1349
+ html | cdx-error | 1169
+ pdf | cdx-error | 1130
+ pdf | petabox-error | 679
+ html | | 620
+ pdf | empty-blob | 562
+ html | blocked-cookie | 545
+ (30 rows)
+
+After the above load:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3036457
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108132
+ pdf | no-pdf-link | 485703
+ pdf | redirect-loop | 436085
+ html | success | 342594
+ html | unknown-scope | 225412
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187999
+ html | no-capture | 187310
+ pdf | no-capture | 172033
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91799
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6897
+ html | petabox-error | 5530
+ pdf | wayback-error | 2707
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 771
+ pdf | empty-blob | 562
+ (30 rows)
+
+Dump ingest requests for crawling (or bulk ingest first?):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json';
+ => COPY 353819
+
+Not that many! Guess the filters are important?
+
+ SELECT COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ );
+ => 3202164
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json
+ => 353k 0:00:16 [21.0k/s]
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Dump seeds again (for crawling):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json';
+ # COPY 350661
+
+And stats again:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3037059
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108476
+ pdf | no-pdf-link | 485705
+ pdf | redirect-loop | 436850
+ html | success | 342762
+ html | unknown-scope | 225412
+ html | redirect-loop | 224683
+ html | html-resource-no-capture | 188058
+ html | no-capture | 185734
+ pdf | no-capture | 170452
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91875
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19042
+ html | terminal-bad-status | 13333
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6898
+ html | petabox-error | 5535
+ pdf | wayback-error | 2711
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 772
+ html | blocked-cookie | 769
+ (30 rows)
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json
+
+Create seedlist:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+ | jq -r .base_url \
+ | sort -u -S 4G \
+ > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt
+
+Sent off and added to the `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl; will
+re-ingest when that completes (a week or two?).
+
+
+## Bulk Ingest
+
+After `TARGETED-ARTICLE-CRAWL-2022-03` wrap-up.
+
+ # 2022-03-22
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md
new file mode 100644
index 0000000..d2a8d71
--- /dev/null
+++ b/notes/ingest/2022-03_oaipmh.md
@@ -0,0 +1,40 @@
+
+Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl.
+
+Note that Martin excluded many Indonesian endpoints; will need to follow up on
+those.
+
+## Prep
+
+Fetch metadata snapshot:
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst
+
+Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large):
+
+ zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \
+ | rg -v 'oai:kb.dk:' \
+ | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \
+ | rg -v 'oai:hispana.mcu.es:' \
+ | rg -v 'oai:bnf.fr:' \
+ | rg -v 'oai:ukm.si:' \
+ | rg -v 'oai:biodiversitylibrary.org:' \
+ | rg -v 'oai:hsp.org:' \
+ | rg -v 'oai:repec:' \
+ | rg -v 'oai:n/a:' \
+ | rg -v 'oai:quod.lib.umich.edu:' \
+ | rg -v 'oai:americanae.aecid.es:' \
+ | rg -v 'oai:www.irgrid.ac.cn:' \
+ | rg -v 'oai:espace.library.uq.edu:' \
+ | rg -v 'oai:edoc.mpg.de:' \
+ | rg -v 'oai:bibliotecadigital.jcyl.es:' \
+ | rg -v 'oai:repository.erciyes.edu.tr:' \
+ | rg -v 'oai:krm.or.kr:' \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz
+
+These failed to transform in the expected way; a change in JSON schema from last time?
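+A quick way to check would be to tally the top-level keys in a sample of the
+new dump and compare against what `oai2ingestrequest.py` expects. A throwaway
+sketch (nothing here is part of the existing scripts):
+
+    import collections
+    import io
+    import json
+
+    import zstandard
+
+    # count top-level keys across the first 1000 records of the dump
+    counts = collections.Counter()
+    with open("oai_pmh_partial_dump_2022_03_01.ndj.zst", "rb") as fh:
+        reader = io.TextIOWrapper(
+            zstandard.ZstdDecompressor().stream_reader(fh), encoding="utf-8"
+        )
+        for i, line in enumerate(reader):
+            if i >= 1000:
+                break
+            counts.update(json.loads(line).keys())
+    print(counts.most_common())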
diff --git a/notes/ingest/2022-04_targeted.md b/notes/ingest/2022-04_targeted.md
new file mode 100644
index 0000000..23fd35f
--- /dev/null
+++ b/notes/ingest/2022-04_targeted.md
@@ -0,0 +1,144 @@
+
+Want to do a crawl similar to recent "patch" crawls, where we run heritrix
+crawls to "fill in" missing (`no-capture`) and failed daily ingests (i.e.,
+those requests coming from fatcat-changelog).
+
+ export PATCHDATE=2022-04-20
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-04
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-04-20.rows.json';
+ # COPY 4842749
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v www.archive.org \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ # 4.75M 0:01:44 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 1515829 www.jstage.jst.go.jp
+ 1052953 doi.org
+ 241704 arxiv.org
+ 219543 www.sciencedirect.com
+ 178562 www.persee.fr
+ 84947 zenodo.org
+ 67397 www.mdpi.com
+ 65775 journals.lww.com
+ 58216 opg.optica.org
+ 50673 osf.io
+ 45776 www.degruyter.com
+ 36664 www.indianjournals.com
+ 35287 pubs.rsc.org
+ 33495 www.bmj.com
+ 33320 www.research-collection.ethz.ch
+ 29728 www.e-periodica.ch
+ 28338 iopscience.iop.org
+ 26364 www.cambridge.org
+ 23840 onlinelibrary.wiley.com
+ 23641 platform.almanhal.com
+ 22660 brill.com
+ 20288 www.osapublishing.org
+ 18561 cgscholar.com
+ 18539 doi.nrct.go.th
+ 15677 www.frontiersin.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+TODO: starting with the "quarterly retry" script/query might make more sense?
+TODO: are there any cases where we do a bulk ingest request, fail, and `terminal_url` is not set?
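+
+For the second TODO, a quick check could be something like (a sketch, not yet run):
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_file_result.hit IS false
+        AND ingest_file_result.terminal_url IS NULL
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;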
+
+## Bulk Ingest Requests (post-crawl)
+
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json
+ => 4.84M 0:03:14 [24.9k/s]
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => started 2022-05-11
diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md
new file mode 100644
index 0000000..bc78998
--- /dev/null
+++ b/notes/ingest/2022-04_unpaywall.md
@@ -0,0 +1,278 @@
+
+New unpaywall snapshot from `2022-03-09`.
+
+This will probably be the last unpaywall crawl? Will switch to openalex in the
+future, because we can automate that ingest process, and run it on our own
+schedule.
+
+ export SNAPSHOT=2022-03-09
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=UNPAYWALL-CRAWL-2022-04
+
+## Download and Archive
+
+ wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz'
+ # 2022-04-09 22:31:43 (98.9 KB/s) - ‘unpaywall_snapshot_2022-03-09T083001.jsonl.gz’ saved [29470830470/29470830470]
+
+ export SNAPSHOT=2022-03-09
+ ia upload unpaywall_snapshot_$SNAPSHOT unpaywall_snapshot_$SNAPSHOT*.jsonl.gz -m title:"Unpaywall Metadata Snapshot ($SNAPSHOT)" -m collection:ia_biblio_metadata -m creator:creator -m date:$SNAPSHOT
+
+ # if needed
+ scp unpaywall_snapshot_$SNAPSHOT*.jsonl.gz wbgrp-svc506.us.archive.org:/srv/sandcrawler/tasks
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ cd /srv/sandcrawler/src/python
+ sudo su sandcrawler
+ pipenv shell
+
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT*.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json
+ # 34.9M 3:02:32 [3.19k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ # 34.9M 5:23:15 [1.80k/s]
+ # Worker: Counter({'total': 34908779, 'insert-requests': 6129630, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 34908779, 'pushed': 34908779})
+
+So about 6.1M new ingest request rows.
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- take "all time" instead of just this recent capture
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2022-03-09.rows.json';
+ => COPY 6025671
+
+ # transform
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json
+ # 6.03M 0:03:26 [29.1k/s]
+
+ # enqueue for bulk processing
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2022-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 3330232
+ success | 2455102
+ redirect-loop | 197117
+ terminal-bad-status | 82618
+ no-pdf-link | 33046
+ blocked-cookie | 16078
+ link-loop | 6745
+ wrong-mimetype | 3416
+ wayback-error | 1385
+ empty-blob | 1142
+ cdx-error | 820
+ body-too-large | 292
+ bad-gzip-encoding | 281
+ wayback-content-error | 267
+ | 253
+ petabox-error | 215
+ skip-url-blocklist | 185
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+After prior "TARGETED" crawl and bulk ingest finished:
+
+ status | count
+ -------------------------+---------
+ no-capture | 3330055
+ success | 2455279
+ redirect-loop | 197117
+ terminal-bad-status | 82618
+ no-pdf-link | 33046
+ blocked-cookie | 16079
+ link-loop | 6745
+ wrong-mimetype | 3416
+ wayback-error | 1385
+ empty-blob | 1142
+ cdx-error | 820
+ body-too-large | 292
+ bad-gzip-encoding | 281
+ wayback-content-error | 267
+ | 253
+ petabox-error | 215
+ skip-url-blocklist | 185
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+Almost no change, which makes sense because of the `ingest_request.created`
+filter.
+
+
+## Dump Seedlist
+
+Dump rows for crawling:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ -- AND date(ingest_request.created) > '2022-04-01'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://doi.org/10.48550/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json';
+ => before ingest and arxiv.org DOI exclusion: COPY 3309091
+ => COPY 3308914
+
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json
+ => 3.31M 0:02:22 [23.2k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt | awk '{print "F+ " $1}' | shuf > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT*
+ 15 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.no_terminal_url.txt
+ 3308914 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json
+ 3028879 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.terminal_url.txt
+ 3038725 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.url.txt
+
+Inject seedlist into crawler:
+
+ scp /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/unpaywall_seedlist_$SNAPSHOT.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+Top domains?
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule | cut -f2 -d' ' | cut -f3 -d/ | sort -S 4G | uniq -c | sort -nr | head -n20
+ 158497 www.scielo.br
+ 144732 onlinelibrary.wiley.com
+ 129349 www.researchsquare.com
+ 94923 hal.archives-ouvertes.fr
+ 69293 openresearchlibrary.org
+ 64584 www.cell.com
+ 60033 link.springer.com
+ 50528 www.degruyter.com
+ 49737 projecteuclid.org
+ 45841 www.jstage.jst.go.jp
+ 44819 www.mdpi.com
+ 44325 ieeexplore.ieee.org
+ 38091 dr.lib.iastate.edu
+ 31030 www.nature.com
+ 30300 discovery.ucl.ac.uk
+ 27692 ntrs.nasa.gov
+ 24215 orca.cardiff.ac.uk
+ 23653 www.frontiersin.org
+ 23474 pure.rug.nl
+ 22660 www.sciencedirect.com
+
+
+## Post-Crawl bulk ingest
+
+ # enqueue for bulk processing
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # done: 2022-07-06
+
+## Post-Crawl, Post-Ingest Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2022-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 4784948 => +2,329,669 ~77%
+ redirect-loop | 485270 => + 288,153 ~10%
+ no-capture | 317598 => -3,012,457
+ terminal-bad-status | 267853 => + 185,235 ~ 6%
+ no-pdf-link | 118303 => + 85,257
+ blocked-cookie | 111373 => + 95,294
+ skip-url-blocklist | 19368
+ link-loop | 9091
+ wrong-mimetype | 7163
+ cdx-error | 2516
+ empty-blob | 1961
+ wayback-error | 1922
+ body-too-large | 509
+ petabox-error | 416
+ wayback-content-error | 341
+ bad-gzip-encoding | 281
+ | 253
+ null-body | 179
+ spn2-cdx-lookup-failure | 89
+ gateway-timeout | 73
+ (20 rows)
+
+Groovy!
diff --git a/notes/ingest/2022-07-15_ingest_fixes.md b/notes/ingest/2022-07-15_ingest_fixes.md
new file mode 100644
index 0000000..ec31a7d
--- /dev/null
+++ b/notes/ingest/2022-07-15_ingest_fixes.md
@@ -0,0 +1,831 @@
+
+## HTML `html-resource-no-capture` Fixes
+
+Tracing down some `html-resource-no-capture` issues. Eg, `javascript:` resources causing errors.
+
+SQL query:
+
+ select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' limit 100;
+ select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' order by random() limit 100;
+
+ select count(*) from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture';
+ => 210,528
+
+http://agroengineering.it/index.php/jae/article/view/568/609
+- old capture, from `20171017204935`
+- missing .css file; seems like an actual case of missing content?
+- TODO: re-crawl/re-ingest when CDX is old
+
+https://www.karger.com/Article/FullText/484130
+- missing: https://www.karger.com/WebMaterial/ShowThumbnail/895999?imgType=2
+- resource is live
+- this was from DOI-LANDING crawl, no resources captured
+- TODO: re-crawl
+
+https://www.mdpi.com/1996-1073/13/21/5563/htm
+- missing: https://www.mdpi.com/1996-1073/13/21/5563/htm
+- common crawl capture; no/few resources?
+- TODO: re-crawl
+
+http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-736X2013000500011&lng=en&tlng=en
+- missing: http://www.scielo.br/img/revistas/pvb/v33n5/a11tab01.jpg
+ not on live web
+- old (2013) wide crawl
+- TODO: re-crawl
+
+http://g3journal.org/lookup/doi/10.1534/g3.116.027730
+- missing: http://www.g3journal.org/sites/default/files/highwire/ggg/6/8/2553/embed/mml-math-4.gif
+- old 2018 landing crawl (no resources)
+- TODO: re-crawl
+
+https://www.frontiersin.org/articles/10.3389/fimmu.2020.576134/full
+- "error_message": "revisit record missing URI and/or DT: warc:abc.net.au-news-20220328-130654/IA-FOC-abc.net.au-news-20220618135308-00003.warc.gz offset:768320762"
+- specific URL: https://www.frontiersin.org/areas/articles/js/app?v=uC9Es8wJ9fbTy8Rj4KipiyIXvhx7XEVhCTHvIrM4ShA1
+- archiveteam crawl
+- seems like a weird corner case. look at more 'frontiersin' articles, and re-crawl this page
+
+https://www.frontiersin.org/articles/10.3389/fonc.2020.01386/full
+- WORKING
+
+https://doi.org/10.4000/trajectoires.2317
+- redirect: https://journals.openedition.org/trajectoires/2317
+- missing: "https://journals.openedition.org/trajectoires/Ce fichier n'existe pas" (note spaces)
+- FIXED
+
+http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S1413-81232002000200008&lng=en&tlng=en
+- WORKING
+
+https://f1000research.com/articles/9-571/v2
+- petabox-error on 'https://www.recaptcha.net/recaptcha/api.js'
+- added recaptcha.net to blocklist
+- still needs a re-crawl
+- SPN capture, from 2020, but images were missing?
+- re-capture has images (though JS still wonky)
+- TODO: re-crawl with SPN2
+
+http://bio.biologists.org/content/4/9/1163
+- DOI LANDING crawl, no sub-resources
+- TODO: recrawl
+
+http://err.ersjournals.com/content/26/145/170039.full
+- missing: http://err.ersjournals.com/sites/default/files/highwire/errev/26/145/170039/embed/graphic-5.gif
+ on live web
+- 2017 targeted heritrix crawl
+- TODO: recrawl
+
+http://www.dovepress.com/synthesis-characterization-and-antimicrobial-activity-of-an-ampicillin-peer-reviewed-article-IJN
+- missing: https://www.dovepress.com/cr_data/article_fulltext/s61000/61143/img/IJN-61143-F02-Thumb.jpg
+- recent archiveteam crawl
+- TODO: recrawl
+
+http://journals.ed.ac.uk/lithicstudies/article/view/1444
+- missing: http://journals.ed.ac.uk/lithicstudies/article/download/1444/2078/6081
+- common crawl
+- TODO: recrawl
+
+http://medisan.sld.cu/index.php/san/article/view/495
+- missing: http://ftp.scu.sld.cu/galen/medisan/logos/redib.jpg
+- this single resource is legit missing
+
+seems like it probably isn't a bad idea to just re-crawl all of these with fresh SPNv2 requests
+
+request sources:
+- fatcat-changelog (doi)
+- fatcat-ingest (doi)
+- doaj
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'html'
+ AND ingest_file_result.status = 'html-resource-no-capture'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json';
+ => COPY 210749
+
+ ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json
+
+Try a sample of 300:
+
+ shuf -n300 /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Seeing a bunch of:
+
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fphys.2020.00454/full","https://www.frontiersin.org/articles/10.3389/fphys.2020.00454/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"]
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fmicb.2019.02507/full","https://www.frontiersin.org/articles/10.3389/fmicb.2019.02507/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"]
+ ["doaj","wayback-content-error","https://www.mdpi.com/2218-1989/10/9/366","https://www.mdpi.com/2218-1989/10/9/366/htm","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:964129887"]
+
+ "error_message": "revisit record missing URI and/or DT: warc:online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz offset:751923069",
+
+
+ ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fnins.2020.00724/full","https://www.frontiersin.org/articles/10.3389/fnins.2020.00724/full","wayback payload sha1hex mismatch: 20220715222216 https://static.frontiersin.org/areas/articles/js/app?v=DfnFHSIgqDJBKQy2bbQ2S8vWyHe2dEMZ1Lg9o6vSS1g1"]
+
+These seem to be transfer encoding issues; fixed?
+
+ ["doaj","html-resource-no-capture","http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S0021-25712013000400003&lng=en&tlng=en","https://scielosp.org/article/aiss/2013.v49n4/336-339/en/","HTML sub-resource not found: https://ssm.scielo.org/media/assets/css/scielo-print.css"]
+
+Full batch:
+
+ # TODO: cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Not running the full batch for now, because these are almost all `wayback-content-error` issues.
+
+ cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | wc -l
+ 114935
+
+ cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+
+## Redirect Loops
+
+Seems like there might have been a bug in how the ingest pipeline dealt with
+multiple redirects (eg, a 301 followed by a 302, or vice-versa), due to how
+CDX lookups and URL normalization were happening.
+
+This could be a really big deal because we have over 11 million such ingest
+requests, and we may even have stopped crawling domains on the basis of
+redirect looping.
+
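+One way this could happen (purely illustrative sketch, not the actual
+sandcrawler code): if the loop check keys on normalized URLs, a legitimate
+301 hop from an http URL to its https equivalent collapses to the same key
+and gets flagged:
+
+    # illustrative only; assumes a SURT-style normalization that drops the scheme
+    def normalize(url: str) -> str:
+        return url.lower().removeprefix("https://").removeprefix("http://")
+
+    def looks_like_redirect_loop(chain: list[str]) -> bool:
+        seen = set()
+        for url in chain:
+            key = normalize(url)
+            if key in seen:
+                return True  # would be reported as 'redirect-loop'
+            seen.add(key)
+        return False
+
+    # a 301 from http to https (then a 302 onward) is not actually a loop,
+    # but both URLs normalize to the same key and get flagged
+    chain = [
+        "http://journal.example.com/article/123/pdf",
+        "https://journal.example.com/article/123/pdf",
+    ]
+    assert looks_like_redirect_loop(chain)
+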
+ select * from ingest_file_result where ingest_type = 'pdf' and status = 'redirect-loop' limit 50;
+
+http://ieeexplore.ieee.org/iel7/7259950/7275573/07275755.pdf
+- 'skip-url-blocklist'
+- paywall on live web
+
+http://www.redjournal.org/article/S0360301616308276/pdf
+- redirect to 'secure.jbs.elsevierhealth.com'
+- ... but re-crawling with SPNv2 worked
+- TODO: reingest this entire journal with SPNv2
+
+http://www.jmirs.org/article/S1939865415001551/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL: success
+
+http://www.cell.com/article/S0006349510026147/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- TODO: try SPNv2?
+- RECRAWL: success
+
+http://infoscience.epfl.ch/record/256431/files/SPL_2018.pdf
+- FIXED: success
+
+http://www.nature.com/articles/hdy1994143.pdf
+- blocked-cookie (idp.nature.com / cookies_not_supported)
+- RECRAWL: gateway-timeout
+
+http://www.thelancet.com/article/S0140673619327606/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL: success
+
+https://pure.mpg.de/pubman/item/item_2065970_2/component/file_2065971/Haase_2014.pdf
+- FIXED: success
+
+http://hdl.handle.net/21.11116/0000-0001-B1A2-F
+- FIXED: success
+
+http://repositorio.ufba.br/ri/bitstream/ri/6072/1/%2858%29v21n6a03.pdf
+- FIXED: success
+
+http://www.jto.org/article/S1556086416329999/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL spn2: success
+
+http://www.jahonline.org/article/S1054139X16303020/pdf
+- blocked-cookie (secure.jbs.elsevierhealth.com)
+- RECRAWL spn2: success
+
+So, wow wow wow, a few things to do here:
+
+- just re-try all these redirect-loop attempts to update status
+- re-ingest all these elsevierhealth blocked crawls with SPNv2. this could take a long time!
+
+Possibly the elsevierhealth stuff will require some deeper fiddling to crawl
+correctly.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'redirect-loop'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json';
+ => COPY 6611342
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json
+
+Start with a sample:
+
+ shuf -n200 /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Wow that is a lot of ingest! And a healthy fraction of 'success', almost all
+via unpaywall (maybe should have done DOAJ/DOI only first). Let's do this full
+batch:
+
+ cat /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+TODO: repeat with broader query (eg, OAI-PMH, MAG, etc).
+
+## Other
+
+Revist resolution failed: \"Didn't get exact CDX url/datetime match. url:https://www.cairn.info/static/images//logo/logo-cairn-negatif.png dt:20220430145322 got:CdxRow(surt='info,cairn)/static/images/logo/logo-cairn-negatif.png', datetime='20220430145322', url='https://www.cairn.info/static/images/logo/logo-cairn-negatif.png', mimetype='image/png', status_code=200, sha1b32='Y3VQOPO2NFUR2EUWNXLYGYGNZPZLQYHU', sha1hex='c6eb073dda69691d12966dd78360cdcbf2b860f4', warc_csize=10875, warc_offset=2315284914, warc_path='archiveteam_archivebot_go_20220430212134_59230631/old.worldurbancampaign.org-inf-20220430-140628-acnq5-00000.warc.gz')\""
+
+ https://www.cairn.info/static/images//logo/logo-cairn-negatif.png 20220430145322
+ https://www.cairn.info/static/images/logo/logo-cairn-negatif.png 20220430145322
+
+Fixed!
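+
+For reference, the mismatch is just the doubled slash in the path; a fuzzy
+comparison along these lines (a sketch, not necessarily the exact fix that
+shipped) is enough to accept the revisit:
+
+    import re
+    from urllib.parse import urlsplit, urlunsplit
+
+    # treat two URLs as equivalent if they differ only by repeated slashes in the path
+    def collapse_path_slashes(url: str) -> str:
+        parts = urlsplit(url)
+        path = re.sub(r"/{2,}", "/", parts.path)
+        return urlunsplit((parts.scheme, parts.netloc, path, parts.query, parts.fragment))
+
+    a = "https://www.cairn.info/static/images//logo/logo-cairn-negatif.png"
+    b = "https://www.cairn.info/static/images/logo/logo-cairn-negatif.png"
+    assert collapse_path_slashes(a) == collapse_path_slashes(b)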
+
+
+## Broken WARC Record?
+
+cdx line:
+
+ net,cloudfront,d1bxh8uas1mnw7)/assets/embed.js 20220716084026 https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js warc/revisit - U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB - - 660 751923069 online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz
+
+download WARC and run:
+
+ zcat IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz | rg d1bxh8uas1mnw7.cloudfront.net/assets/embed.js -a -C 20
+
+the WARC record:
+
+ WARC/1.0
+ WARC-Type: revisit
+ WARC-Target-URI: https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js
+ WARC-Date: 2022-07-16T08:40:26Z
+ WARC-Payload-Digest: sha1:U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB
+ WARC-IP-Address: 13.227.21.220
+ WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+ WARC-Truncated: length
+ WARC-Record-ID: <urn:uuid:cc79139e-d43f-4b43-9b9e-f923610344d0>
+ Content-Type: application/http; msgtype=response
+ Content-Length: 493
+
+ HTTP/1.1 200 OK
+ Content-Type: application/javascript
+ Content-Length: 512
+ Connection: close
+ Last-Modified: Fri, 22 Apr 2022 08:45:38 GMT
+ Accept-Ranges: bytes
+ Server: AmazonS3
+ Date: Fri, 15 Jul 2022 16:36:08 GMT
+ ETag: "1c28db48d4012f0221b63224a3bb7137"
+ Vary: Accept-Encoding
+ X-Cache: Hit from cloudfront
+ Via: 1.1 5b475307685b5cecdd0df414286f5438.cloudfront.net (CloudFront)
+ X-Amz-Cf-Pop: SFO20-C1
+ X-Amz-Cf-Id: SIRR_1LT8mkp3QVaiGYttPuomxyDfJ-vB6dh0Slg_qqyW0_WwnA1eg==
+ Age: 57859
+
+where are the `WARC-Refers-To-Target-URI` and `WARC-Refers-To-Date` lines?
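+
+For reference, a revisit record that resolves cleanly should also carry the
+two reference fields, roughly like this (values here are illustrative, not
+from the actual record):
+
+    WARC-Refers-To-Target-URI: https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js
+    WARC-Refers-To-Date: 2022-04-22T08:45:38Z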
+
+## osf.io
+
+ select status, terminal_status_code, count(*) from ingest_file_result where base_url LIKE 'https://doi.org/10.17605/osf.io/%' and ingest_type = 'pdf' group by status, terminal_status_code order by count(*) desc limit 30;
+
+ status | terminal_status_code | count
+ -------------------------+----------------------+-------
+ terminal-bad-status | 404 | 92110
+ no-pdf-link | 200 | 46932
+ not-found | 200 | 20212
+ no-capture | | 8599
+ success | 200 | 7604
+ redirect-loop | 301 | 2125
+ terminal-bad-status | 503 | 1657
+ cdx-error | | 1301
+ wrong-mimetype | 200 | 901
+ terminal-bad-status | 410 | 364
+ read-timeout | | 167
+ wayback-error | | 142
+ gateway-timeout | | 139
+ terminal-bad-status | 500 | 76
+ spn2-error | | 63
+ spn2-backoff | | 42
+ petabox-error | | 39
+ spn2-backoff | 200 | 27
+ redirect-loop | 302 | 19
+ terminal-bad-status | 400 | 15
+ terminal-bad-status | 401 | 15
+ remote-server-error | | 14
+ timeout | | 11
+ terminal-bad-status | | 11
+ petabox-error | 200 | 10
+ empty-blob | 200 | 8
+ null-body | 200 | 6
+ spn2-error:unknown | | 5
+ redirect-loop | 308 | 4
+ spn2-cdx-lookup-failure | | 4
+ (30 rows)
+
+Many of these are now non-existent, or are datasets/registrations rather than articles.
+Hrm.
+
+
+## Large DOAJ no-pdf-link Domains
+
+ SELECT
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain,
+ COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_request.base_url = ingest_file_result.base_url
+ WHERE
+ ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.link_source = 'doaj'
+ GROUP BY
+ domain
+ ORDER BY
+ COUNT(*) DESC
+ LIMIT 50;
+
+ domain | count
+ -------------------------------------------------------+--------
+ www.sciencedirect.com | 211090
+ auth.openedition.org | 20741
+ journal.frontiersin.org:80 | 11368
+ journal.frontiersin.org | 6494
+ ejde.math.txstate.edu | 4301
+ www.arkat-usa.org | 4001
+ www.scielo.br | 3736
+ www.lcgdbzz.org | 2892
+ revistas.uniandes.edu.co | 2715
+ scielo.sld.cu | 2612
+ www.egms.de | 2488
+ journals.lww.com | 2415
+ ter-arkhiv.ru | 2239
+ www.kitlv-journals.nl | 2076
+ www.degruyter.com | 2061
+ jwcn-eurasipjournals.springeropen.com | 1929
+ www.cjcnn.org | 1908
+ www.aimspress.com | 1885
+ vsp.spr-journal.ru | 1873
+ dx.doi.org | 1648
+ www.dlib.si | 1582
+ aprendeenlinea.udea.edu.co | 1548
+ www.math.u-szeged.hu | 1448
+ dergipark.org.tr | 1444
+ revistas.uexternado.edu.co | 1429
+ learning-analytics.info | 1419
+ drive.google.com | 1399
+ www.scielo.cl | 1326
+ www.economics-ejournal.org | 1267
+ www.jssm.org | 1240
+ html.rhhz.net | 1232
+ journalofinequalitiesandapplications.springeropen.com | 1214
+ revistamedicina.net | 1197
+ filclass.ru | 1154
+ ceramicayvidrio.revistas.csic.es | 1152
+ gynecology.orscience.ru | 1126
+ www.tobaccoinduceddiseases.org | 1090
+ www.tandfonline.com | 1046
+ www.querelles-net.de | 1038
+ www.swjpcc.com | 1032
+ microbiologyjournal.org | 1028
+ revistas.usal.es | 1027
+ www.medwave.cl | 1023
+ ijtech.eng.ui.ac.id | 1023
+ www.scielo.sa.cr | 1021
+ vestnik.szd.si | 986
+ www.biomedcentral.com:80 | 984
+ scielo.isciii.es | 983
+ bid.ub.edu | 970
+ www.meirongtv.com | 959
+ (50 rows)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ejde.math.txstate.edu%' limit 5;
+ http://ejde.math.txstate.edu/Volumes/2018/30/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2012/137/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2016/268/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2015/194/abstr.html
+ http://ejde.math.txstate.edu/Volumes/2014/43/abstr.html
+ # plain HTML, not really parse-able
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.arkat-usa.org%' limit 5;
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0013.909
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0007.717
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.p008.158
+ https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0014.216
+ # fixed (embed PDF)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.scielo.br%' limit 5;
+ https://doi.org/10.5935/0034-7280.20200075
+ https://doi.org/10.5935/0004-2749.20200071
+ https://doi.org/10.5935/0034-7280.20200035
+ http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1516-44461999000400014
+ https://doi.org/10.5935/0034-7280.20200047
+ # need recrawls?
+ # then success
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.lcgdbzz.org%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://revistas.uniandes.edu.co%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://scielo.sld.cu%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.egms.de%' limit 5;
+ https://doi.org/10.3205/16dgnc020
+ http://nbn-resolving.de/urn:nbn:de:0183-19degam1126
+ http://www.egms.de/en/meetings/dgpraec2019/19dgpraec032.shtml
+ http://www.egms.de/en/meetings/dkou2019/19dkou070.shtml
+ http://nbn-resolving.de/urn:nbn:de:0183-20nrwgu625
+ # mostly abstracts, don't have PDF versions
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ter-arkhiv.ru%' limit 5;
+ https://doi.org/10.26442/terarkh201890114-47
+ https://doi.org/10.26442/00403660.2019.12.000206
+ https://journals.eco-vector.com/0040-3660/article/download/32246/pdf
+ https://journals.eco-vector.com/0040-3660/article/download/33578/pdf
+ https://doi.org/10.26442/00403660.2019.12.000163
+ # working, needed recrawls (some force re-crawls)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.kitlv-journals.nl%' limit 5;
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.cjcnn.org%' limit 5;
+
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.dlib.si%' limit 5;
+ https://srl.si/ojs/srl/article/view/2910
+ https://srl.si/ojs/srl/article/view/3640
+ https://srl.si/ojs/srl/article/view/2746
+ https://srl.si/ojs/srl/article/view/2557
+ https://srl.si/ojs/srl/article/view/2583
+ # fixed? (dlib.si)
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.jssm.org%' limit 5;
+ http://www.jssm.org/vol4/n4/8/v4n4-8text.php
+ http://www.jssm.org/vol7/n1/19/v7n1-19text.php
+ http://www.jssm.org/vol9/n3/10/v9n3-10text.php
+ http://www.jssm.org/abstresearcha.php?id=jssm-14-347.xml
+ http://www.jssm.org/vol7/n2/11/v7n2-11text.php
+ # works as an HTML document? otherwise hard to select on PDF link
+
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://filclass.ru%' limit 5;
+ https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism
+ https://filclass.ru/en/archive/2015/42/training-as-an-effective-form-of-preparation-for-the-final-essay
+ https://filclass.ru/en/archive/2020/vol-25-3/didaktizatsiya-literatury-rossijskikh-nemtsev-zanyatie-po-poeme-viktora-klyajna-jungengesprach
+ https://filclass.ru/en/archive/2015/40/the-communicative-behaviour-of-the-russian-intelligentsia-and-its-reflection-in-reviews-as-a-genre-published-in-online-literary-journals-abroad
+ https://filclass.ru/en/archive/2016/46/discoursive-means-of-implication-of-instructive-components-within-the-anti-utopia-genre
+ # fixed
+ # TODO: XXX: re-crawl/ingest
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://microbiologyjournal.org%' limit 5;
+ https://microbiologyjournal.org/the-relationship-between-the-type-of-infection-and-antibiotic-resistance/
+ https://microbiologyjournal.org/antimicrobial-resistant-shiga-toxin-producing-escherichia-coli-isolated-from-ready-to-eat-meat-products-and-fermented-milk-sold-in-the-formal-and-informal-sectors-in-harare-zimbabwe/
+ https://microbiologyjournal.org/emerging-antibiotic-resistance-in-mycoplasma-microorganisms-designing-effective-and-novel-drugs-therapeutic-targets-current-knowledge-and-futuristic-prospects/
+ https://microbiologyjournal.org/microbiological-and-physicochemicalpropertiesofraw-milkproduced-from-milking-to-delivery-to-milk-plant/
+ https://microbiologyjournal.org/association-of-insulin-based-insulin-resistance-with-liver-biomarkers-in-type-2-diabetes-mellitus/
+ # HTML article, no PDF
+ # ... but only sometimes
+
+ select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.medwave.cl%' limit 5;
+ http://www.medwave.cl/link.cgi/Medwave/Perspectivas/Cartas/6878
+ https://www.medwave.cl/link.cgi/Medwave/Revisiones/RevisionClinica/8037.act
+ http://dx.doi.org/10.5867/medwave.2012.03.5332
+ https://www.medwave.cl/link.cgi/Medwave/Estudios/Casos/7683.act
+ http://www.medwave.cl/link.cgi/Medwave/Revisiones/CAT/5964
+ # HTML article, no PDF
+
+Re-ingest HTML:
+
+ https://fatcat.wiki/container/mafob4ewkzczviwipyul7knndu (DONE)
+ https://fatcat.wiki/container/6rgnsrp3rnexdoks3bxcmbleda (DONE)
+
+Re-ingest PDF:
+
+ doi_prefix:10.5935 (DONE)
+ doi_prefix:10.26442
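+
+For the remaining PDF prefix, the same pattern as the `doi_prefix:10.4230`
+re-ingest below should work (a sketch, not yet run):
+
+    ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf query 'doi_prefix:10.26442'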
+
+## More Scielo
+
+More scielo? `doi_prefix:10.5935 in_ia:false`
+
+ http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873
+ # OJS? fixed
+
+ https://revistas.unicentro.br/index.php/repaa/article/view/2667/2240
+ # working, but needed re-crawl
+
+ http://www.rbcp.org.br/details/2804/piezoelectric-preservative-rhinoplasty--an-alternative-approach-for-treating-bifid-nose-in-tessier-no--0-facial-cleft
+
+A few others, mostly now working
+
+## Recent OA DOIs
+
+ fatcat-cli search release 'is_oa:true (type:article-journal OR type:article OR type:paper-conference) !doi_prefix:10.5281 !doi_prefix:10.6084 !doi_prefix:10.48550 !doi_prefix:10.25446 !doi_prefix:10.25384 doi:* date:>2022-06-15 date:<2022-07-15 in_ia:false !publisher_type:big5' --index-json --limit 0 | pv -l > recent_missing_oa.json
+
+ wc -l recent_missing_oa.json
+ 24433
+
+ cat recent_missing_oa.json | jq .doi_prefix -r | sort | uniq -c | sort -nr | head
+ 4968 10.3390
+ 1261 10.1080
+ 687 10.23668
+ 663 10.1021
+ 472 10.1088
+ 468 10.4000
+ 367 10.3917
+ 357 10.1364
+ 308 10.4230
+ 303 10.17863
+
+ cat recent_missing_oa.json | jq .doi_registrar -r | sort | uniq -c | sort -nr
+ 19496 crossref
+ 4836 datacite
+ 101 null
+
+ cat recent_missing_oa.json | jq .publisher_type -r | sort | uniq -c | sort -nr
+ 9575 longtail
+ 8419 null
+ 3861 society
+ 822 unipress
+ 449 oa
+ 448 scielo
+ 430 commercial
+ 400 repository
+ 22 other
+ 7 archive
+
+ cat recent_missing_oa.json | jq .publisher -r | sort | uniq -c | sort -nr | head
+ 4871 MDPI AG
+ 1107 Informa UK (Taylor & Francis)
+ 665 EAG-Publikationen
+ 631 American Chemical Society
+ 451 IOP Publishing
+ 357 The Optical Society
+ 347 OpenEdition
+ 309 CAIRN
+ 308 Schloss Dagstuhl - Leibniz-Zentrum für Informatik
+ 303 Apollo - University of Cambridge Repository
+
+ cat recent_missing_oa.json | jq .container_name -r | sort | uniq -c | sort -nr | head
+ 4908 null
+ 378 Sustainability
+ 327 ACS Omega
+ 289 Optics Express
+ 271 International Journal of Environmental Research and Public Health
+ 270 International Journal of Health Sciences
+ 238 Sensors
+ 223 International Journal of Molecular Sciences
+ 207 Molecules
+ 193 Proceedings of the National Academy of Sciences of the United States of America
+
+ cat recent_missing_oa.json \
+ | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \
+ | wc -l
+ 16558
+
+ cat recent_missing_oa.json | rg -i mdpi | shuf -n10 | jq .doi -r
+ 10.3390/molecules27144419
+ => was a 404
+ => recrawl was successful
+ 10.3390/math10142398
+ => was a 404
+ 10.3390/smartcities5030039
+ => was a 404
+
+Huh, we need to re-try/re-crawl MDPI URLs every week or so? Or special-case this situation.
+Could be just a fatcat script, or a sandcrawler query.
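+
+A sandcrawler-side version could be a weekly query along these lines (a
+sketch; the interval, status filter, and output path are all guesses):
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_file_result.status = 'terminal-bad-status'
+            AND ingest_file_result.terminal_status_code = 404
+            AND ingest_file_result.terminal_url LIKE '%www.mdpi.com%'
+            AND ingest_file_result.updated < NOW() - INTERVAL '7 days'
+    ) TO '/srv/sandcrawler/tasks/retry_mdpi_404.rows.json';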
+
+ cat recent_missing_oa.json \
+ | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \
+ | shuf -n10 | jq .doi -r
+
+ https://doi.org/10.18452/24860
+ => success (just needed quarterly retry?)
+ => b8c6c86aebd6cd2d85515441bbce052bcff033f2 (not in fatcat.wiki)
+ => current status is "bad-redirect"
+ https://doi.org/10.26181/20099540.v1
+ => success
+ => 3f9b1ff2a09f3ea9051dbbef277579e8a0b4df30
+ => this is figshare, and versioned. PDF was already attached to another DOI: https://doi.org/10.26181/20099540
+ https://doi.org/10.4230/lipics.sea.2022.22
+ => there is a bug resulting in trailing slash in `citation_pdf_url`
+ => fixed as a quirks mode
+ => emailed to report
+ https://doi.org/10.3897/aca.5.e89679
+ => success
+ => e6fd1e066c8a323dc56246631748202d5fb48808
+ => current status is 'bad-redirect'
+ https://doi.org/10.1103/physrevd.105.115035
+ => was 404
+ => success after force-recrawl of the terminal URL (not base URL)
+ https://doi.org/10.1155/2022/4649660
+ => was 404
+ => success after force-recrawl (of base_url)
+ https://doi.org/10.1090/spmj/1719
+ => paywall (not actually OA)
+ => https://fatcat.wiki/container/x6jfhegb3fbv3bcbqn2i3espiu is on Szczepanski list, but isn't all OA?
+ https://doi.org/10.1139/as-2022-0011
+ => was no-pdf-link
+ => fixed fulltext URL extraction
+ => still needed to re-crawl terminal PDF link? hrm
+ https://doi.org/10.31703/grr.2022(vii-ii).02
+ => was no-pdf-link
+ => fixed! success
+ https://doi.org/10.1128/spectrum.00154-22
+ => was 404
+ => now repeatably 503, via SPN
+ https://doi.org/10.51601/ijersc.v3i3.393
+ => 503 server error
+ https://doi.org/10.25416/ntr.20137379.v1
+ => is figshare
+ => docx (not PDF)
+ https://doi.org/10.25394/pgs.20263698.v1
+ => figshare
+ => embargo'd
+ https://doi.org/10.24850/j-tyca-14-4-7
+ => was no-pdf-link
+ => docs.google.com/viewer (!)
+ => now handle this (success)
+ https://doi.org/10.26267/unipi_dione/1832
+ => was bad-redirect
+ => success
+ https://doi.org/10.25560/98019
+ => body-too-large
+ => also, PDF metadata fails to parse
+ => is actually like 388 MByte
+ https://doi.org/10.14738/abr.106.12511
+ => max-hops-exceeded
+ => bumped max-hops from 6 to 8
+ => then success (via google drive)
+ https://doi.org/10.24350/cirm.v.19933803
+ => video, not PDF
+ https://doi.org/10.2140/pjm.2022.317.67
+ => link-loop
+ => not actually OA
+ https://doi.org/10.26265/polynoe-2306
+ => was bad-redirect
+ => now success
+ https://doi.org/10.3389/fpls.2022.826875
+ => frontiers
+ => was terminal-bad-status (403)
+ => success on retry (not sure why)
+ => maybe this is also a date-of-publication thing?
+ => not sure all these should be retried though
+ https://doi.org/10.14198/medcom.22240
+ => was terminal-bad-status (404)
+ => force-recrawl resulted in an actual landing page, but still no-pdf-link
+ => but actual PDF is a real 404, it seems. oh well
+ https://doi.org/10.31729/jnma.7579
+ => no-capture
+ https://doi.org/10.25373/ctsnet.20146931.v2
+ => figshare
+ => video, not document or PDF
+ https://doi.org/10.1007/s42600-022-00224-0
+ => not yet crawled/attempted (!)
+ => springer
+ => not actually OA
+ https://doi.org/10.37391/ijeer.100207
+ => some upstream issue (server not found)
+ https://doi.org/10.1063/5.0093946
+ => aip.scitation.org, is actually OA (can download in browser)
+ => cookie trap?
+ => redirect-loop (seems like a true redirect loop)
+ => retrying the terminal PDF URL seems to have worked
+ https://doi.org/10.18502/jchr.v11i2.9998
+ => no actual fulltext on publisher site
+ https://doi.org/10.1128/spectrum.01144-22
+ => this is a 503 error, even after retrying. weird!
+
+DONE: check `publisher_type` in chocula for:
+- "MDPI AG"
+- "Informa UK (Taylor & Francis)"
+
+ cat recent_missing_oa.json | jq '[.publisher, .publisher_type]' -c | sort | uniq -c | sort -nr | head -n40
+ 4819 ["MDPI AG","longtail"]
+ 924 ["Informa UK (Taylor & Francis)",null]
+ 665 ["EAG-Publikationen",null]
+ 631 ["American Chemical Society","society"]
+ 449 ["IOP Publishing","society"]
+ 357 ["The Optical Society","society"]
+ 336 ["OpenEdition","oa"]
+ 309 ["CAIRN","repository"]
+ 308 ["Schloss Dagstuhl - Leibniz-Zentrum für Informatik",null]
+ 303 ["Apollo - University of Cambridge Repository",null]
+ 292 ["Springer (Biomed Central Ltd.)",null]
+ 275 ["Purdue University Graduate School",null]
+ 270 ["Suryasa and Sons","longtail"]
+ 257 ["La Trobe",null]
+ 216 ["Frontiers Media SA","longtail"]
+ 193 ["Proceedings of the National Academy of Sciences","society"]
+ 182 ["Informa UK (Taylor & Francis)","longtail"]
+ 176 ["American Physical Society","society"]
+ 168 ["Institution of Electrical Engineers","society"]
+ 166 ["Oxford University Press","unipress"]
+ 153 ["Loughborough University",null]
+
+    chocula mostly seems to set these correctly. Is the issue that the
+    chocula-computed values aren't coming through or getting updated?
+    Probably: a combination of the release (from container) metadata update,
+    the chocula importer not doing updates based on this field, and some
+    old/incorrect values.
+
+ did some cleanups of specific containers, and next chocula update should
+ result in a bunch more `publisher_type` getting populated on older
+ containers
+
+
+TODO: verify URLs are actually URLs... somewhere in the ingest pipeline?
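+
+A minimal check, if it ends up in the Python ingest code, could be as simple
+as (a sketch):
+
+    from urllib.parse import urlparse
+
+    # cheap sanity check before enqueueing an ingest request; rejects empty
+    # strings, bare DOIs, "javascript:" resources, etc.
+    def looks_like_url(s: str) -> bool:
+        parsed = urlparse(s)
+        return parsed.scheme in ("http", "https") and bool(parsed.netloc)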
+
+TODO: fatcat: don't ingest figshare "work" DOIs, only the "versioned" ones (?)
+ doi_prefix:10.26181
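+
+The versioned DOIs are distinguishable by the `.v1`/`.v2` style suffix (as
+with `10.26181/20099540.v1` above); a sketch of the filter:
+
+    import re
+
+    # versioned: "10.26181/20099540.v1"; "work" DOI: "10.26181/20099540"
+    def is_versioned_figshare_doi(doi: str) -> bool:
+        return bool(re.search(r"\.v\d+$", doi))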
+
+WIP: sandcrawler: regularly (weekly?) re-try 404 errors (the terminal URL, not the base url?) (or, some kind of delay?)
+ doi_prefix:10.3390 (MDPI)
+ doi_prefix:10.1103
+ doi_prefix:10.1155
+
+DONE: simply re-ingest all:
+ doi_prefix:10.4230
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf query 'doi_prefix:10.4230'
+ # Counter({'ingest_request': 2096, 'elasticsearch_release': 2096, 'estimate': 2096, 'kafka': 2096})
+ container_65lzi3vohrat5nnymk3dqpoycy
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 65lzi3vohrat5nnymk3dqpoycy
+ # Counter({'ingest_request': 187, 'elasticsearch_release': 187, 'estimate': 187, 'kafka': 187})
+ container_5vp2bio65jdc3blx6rfhp3chde
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 5vp2bio65jdc3blx6rfhp3chde
+ # Counter({'ingest_request': 83, 'elasticsearch_release': 83, 'estimate': 83, 'kafka': 83})
+
+DONE: verify and maybe re-ingest all:
+ is_oa:true publisher:"Canadian Science Publishing" in_ia:false
+
+ ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --allow-non-oa --ingest-type pdf --force-recrawl query 'year:>2010 is_oa:true publisher:"Canadian Science Publishing" in_ia:false !journal:print'
+ # Counter({'ingest_request': 1041, 'elasticsearch_release': 1041, 'estimate': 1041, 'kafka': 1041})
+
+
+## Re-Ingest bad-redirect, max-hops-exceeded, and google drive
+
+Similar to `redirect-loop`:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'bad-redirect'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json';
+ # COPY 100011
+ # after first run: COPY 5611
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.status = 'max-hops-exceeded'
+ -- AND ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json';
+ # COPY 3546
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.hit is false
+ AND ingest_file_result.terminal_url like 'https://docs.google.com/viewer%'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json';
+ # COPY 1082
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json
+
+ cat /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ cat /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ cat /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ # DONE
diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md
new file mode 100644
index 0000000..74aeb8d
--- /dev/null
+++ b/notes/ingest/2022-07-19_dblp.md
@@ -0,0 +1,50 @@
+
+Cross-posting from fatcat bulk metadata update/ingest.
+
+ zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 631k 0:00:11 [54.0k/s]
+
+
+## Post-Crawl Stats
+
+This is after bulk ingest, crawl, and a bit of "live" re-ingest. Query run
+2022-09-06:
+
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'dblp'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+-----------------------+--------
+ pdf | success | 305142
+ pdf | no-pdf-link | 192683
+ pdf | no-capture | 42634
+ pdf | terminal-bad-status | 38041
+ pdf | skip-url-blocklist | 31055
+ pdf | link-loop | 9263
+ pdf | wrong-mimetype | 4545
+ pdf | redirect-loop | 3952
+ pdf | empty-blob | 2705
+ pdf | wayback-content-error | 834
+ pdf | wayback-error | 294
+ pdf | petabox-error | 202
+ pdf | blocked-cookie | 155
+ pdf | cdx-error | 115
+ pdf | body-too-large | 66
+ pdf | bad-redirect | 19
+ pdf | timeout | 7
+ pdf | bad-gzip-encoding | 4
+ (18 rows)
+
+That is quite a lot of `no-pdf-link`; might be worth doing a random sample
+and/or a re-ingest. There is also a chunk of `no-capture` to retry.
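+
+A sample query for eyeballing those `no-pdf-link` cases (a sketch, not yet run):
+
+    SELECT ingest_file_result.terminal_url
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.link_source = 'dblp'
+        AND ingest_request.ingest_type = 'pdf'
+        AND ingest_file_result.status = 'no-pdf-link'
+    ORDER BY random()
+    LIMIT 50;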
diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md
new file mode 100644
index 0000000..7e55633
--- /dev/null
+++ b/notes/ingest/2022-07_doaj.md
@@ -0,0 +1,199 @@
+
+This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for
+heritrix bulk crawling, along with JALC and DBLP URLs.
+
+ export SNAPSHOT=2022-07-20
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
+ # 9.72M 0:36:28 [4.44k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # 9.72M 0:17:04 [9.49k/s]
+ # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})
+
+Stats after this load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3165539
+ pdf | | 2078874
+ html | | 1547698
+ html | wrong-scope | 1114332
+ pdf | no-pdf-link | 517261
+ html | success | 388376
+ html | unknown-scope | 242044
+ pdf | no-capture | 179030
+ pdf | terminal-bad-status | 174741
+ html | no-capture | 155323
+ pdf | null-body | 129267
+ pdf | redirect-loop | 127136
+ html | html-resource-no-capture | 117275
+ html | null-body | 100296
+ pdf | blocked-cookie | 71093
+ html | redirect-loop | 65519
+ html | terminal-bad-status | 64856
+ html | blocked-cookie | 64095
+ html | spn2-backoff | 55173
+ pdf | link-loop | 27440
+ html | wrong-mimetype | 26016
+ html | wayback-content-error | 20109
+ xml | | 13624
+ pdf | wrong-mimetype | 8411
+ xml | success | 6899
+ html | petabox-error | 6199
+ html | wayback-error | 5269
+ html | spn2-cdx-lookup-failure | 4635
+ html | spn2-recent-capture | 4527
+ xml | null-body | 2353
+ (30 rows)
+
+## Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
+ # COPY 3962331
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
+ # 3.96M 0:01:47 [36.7k/s]
+
+Top domains:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
+ 789988 www.mdpi.com
+ 318142 www.frontiersin.org
+ 226316 link.springer.com
+ 204429 www.scielo.br
+ 201175 www.sciencedirect.com
+ 72852 ieeexplore.ieee.org
+ 68983 dx.doi.org
+ 33286 www.dovepress.com
+ 26020 elifesciences.org
+ 23838 www.cetjournal.it
+ 21102 mab-online.nl
+ 20242 www.revistas.usp.br
+ 16564 periodicos.uem.br
+ 15710 journals.openedition.org
+ 14514 dergipark.org.tr
+ 14072 apcz.umk.pl
+ 13924 ojs.minions.amsterdam
+ 13717 bmgn-lchr.nl
+ 13512 ojstest.minions.amsterdam
+ 10440 journals.asm.org
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # Done
+
+## Stats Again
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 4704006
+ html | wrong-scope | 1761227
+ html | success | 778165
+ pdf | no-pdf-link | 759805
+ html | no-capture | 382080
+ html | unknown-scope | 313391
+ html | html-resource-no-capture | 292953
+ pdf | no-capture | 290311
+ pdf | terminal-bad-status | 271776
+ pdf | null-body | 129267
+ pdf | blocked-cookie | 108491
+ html | terminal-bad-status | 103014
+ html | null-body | 100296
+ html | blocked-cookie | 88533
+ pdf | | 81517
+ pdf | skip-url-blocklist | 76443
+ html | spn2-backoff | 50615
+ pdf | link-loop | 45516
+ html | wrong-mimetype | 33525
+ html | wayback-content-error | 25535
+ pdf | empty-blob | 21431
+ pdf | redirect-loop | 19795
+ html | petabox-error | 18291
+ html | empty-blob | 14391
+ pdf | wrong-mimetype | 14084
+ html | redirect-loop | 12856
+ xml | success | 10381
+ xml | no-capture | 10008
+ html | skip-url-blocklist | 3294
+ html | cdx-error | 3275
+ (30 rows)
+
+Pretty good success rate for PDFs. That is a lot of `no-capture`! And why are
+there 81k PDF requests with no attempt at all? Maybe a filter, or bogus URLs.
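+
+One way to check (a sketch, not actually run as part of these notes) would be
+to sample the un-attempted rows directly:
+
+    SELECT ingest_request.base_url, ingest_request.created
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.link_source = 'doaj'
+        AND ingest_request.ingest_type = 'pdf'
+        AND ingest_file_result.status IS NULL
+    ORDER BY ingest_request.created DESC
+    LIMIT 20;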
+
+Over 1.5M new PDF success over this crawl iteration period, nice.
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md
new file mode 100644
index 0000000..415f23b
--- /dev/null
+++ b/notes/ingest/2022-07_targeted.md
@@ -0,0 +1,140 @@
+
+Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs.
+
+ export PATCHDATE=2022-07-29
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ OR ingest_file_result.terminal_status_code = 429
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json';
+ => COPY 3524573
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ => 3.11M 0:01:08 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 624948 doi.org
+ 382492 www.jstage.jst.go.jp
+ 275087 www.mdpi.com
+ 157134 www.persee.fr
+ 108979 www.sciencedirect.com
+ 94375 www.scielo.br
+ 50834 onlinelibrary.wiley.com
+ 49991 journals.lww.com
+ 30354 www.frontiersin.org
+ 27963 doaj.org
+ 27058 www.e-periodica.ch
+ 24147 dl.acm.org
+ 23389 aclanthology.org
+ 22086 www.research-collection.ethz.ch
+ 21589 medien.die-bonn.de
+ 18866 www.ingentaconnect.com
+ 18583 doi.nrct.go.th
+ 18271 repositories.lib.utexas.edu
+ 17634 hdl.handle.net
+ 16366 archives.datapages.com
+ 15146 cgscholar.com
+ 13987 dl.gi.de
+ 13188 www.degruyter.com
+ 12503 ethos.bl.uk
+ 12304 preprints.jmir.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+ => done
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+
+## Re-Ingest
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json
+ => 3.52M 0:01:37 [36.2k/s]
+
+Ingest:
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md
new file mode 100644
index 0000000..ac7c68f
--- /dev/null
+++ b/notes/ingest/2022-09_oaipmh.md
@@ -0,0 +1,397 @@
+
+Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921>
+
+I updated the transform script to block some additional domains.
+
+
+## Prep
+
+Fetch the snapshot:
+
+ cd /srv/sandcrawler/tasks/
+ wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst
+
+Transform to ingest requests:
+
+ cd /srv/sandcrawler/src/python
+ git log | head -n1
+ # commit dfd4605d84712eccb95a63e50b0bcb343642b433
+
+ pipenv shell
+ zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz
+ # 16.1M 1:01:02 [4.38k/s]
+
+Curious about types, though this would probably be handled at fatcat ingest
+time:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt
+
+ head oai_type_counts.txt -n30
+ 5623867 info:eu-repo/semantics/article
+ 5334928 info:eu-repo/semantics/publishedVersion
+ 3870359 text
+ 1240225 Text
+ 829169 Article
+ 769849 NonPeerReviewed
+ 665700 PeerReviewed
+ 648740 Peer-reviewed Article
+ 547857 article
+ 482906 info:eu-repo/semantics/bachelorThesis
+ 353814 Thesis
+ 329269 Student thesis
+ 262650 info:eu-repo/semantics/conferenceObject
+ 185354 Journal articles
+ 162021 info:eu-repo/semantics/doctoralThesis
+ 152079 Journal Article
+ 150226 Research Article
+ 130217 Conference papers
+ 127255 Artículo revisado por pares
+ 124243 Newspaper
+ 123908 ##rt.metadata.pkp.peerReviewed##
+ 123309 Photograph
+ 122981 info:eu-repo/semantics/masterThesis
+ 116719 Book
+ 108946 Image
+ 108216 Report
+ 107946 Other
+ 103562 masterThesis
+ 103038 info:eu-repo/semantics/other
+ 101404 StillImage
+ [...]
+
+And formats:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt
+
+ head -n 20 oai_format_counts.txt
+ 11151928 application/pdf
+ 677413 text
+ 561656 text/html
+ 498518 image/jpeg
+ 231219 Text
+ 193638 text/xml
+ 147214 Image
+ 117073 image/jpg
+ 110872 pdf
+ 91323 image/tiff
+ 76948 bib
+ 75393 application/xml
+ 70244 Digitized from 35 mm. microfilm.
+ 68206 mods
+ 59227 PDF
+ 57677 application/epub+zip
+ 57602 application/octet-stream
+ 52072 text/plain
+ 51620 application/msword
+ 47227 audio/mpeg
+
+Also, just overall size (number of records):
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l
+ # 20,840,301
+
+Next load in to sandcrawler DB:
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request -
+
+ Traceback (most recent call last):
+ File "./persist_tool.py", line 311, in <module>
+ main()
+ File "./persist_tool.py", line 307, in main
+ args.func(args)
+ File "./persist_tool.py", line 119, in run_ingest_request
+ pusher.run()
+ File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run
+ self.worker.push_batch(batch)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values
+ cur.execute(b''.join(parts))
+ psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx"
+ DETAIL: Index row references tuple (6893121,3) in relation "ingest_request".
+ HINT: Values larger than 1/3 of a buffer page cannot be indexed.
+ Consider a function index of an MD5 hash of the value, or use full text indexing.
+ 15.7M 0:41:48 [6.27k/s]
+
+Darn, this means we won't get reasonable stats about how many rows were
+inserted/updated.
+
+Patched the persist tool to skip very long URLs, and ran again (backwards, just
+URLs which didn't get inserted already):
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \
+ | tac \
+ | head -n1000000 \
+ | pv -l \
+ | ./persist_tool.py ingest-request -
+ # 1.00M 0:03:04 [5.41k/s]
+ # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0})
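+
+(As an aside: the postgres HINT above suggests the alternative of indexing an
+MD5 hash of the URL rather than the raw value. A sketch of that approach, not
+applied here, with a hypothetical index name:)
+
+    CREATE INDEX CONCURRENTLY ingest_request_base_url_md5_idx
+        ON ingest_request (md5(base_url));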
+
+Status of just the new lines:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+---------
+ | 6398455
+ success | 540219
+ no-pdf-link | 41316
+ link-loop | 23871
+ no-capture | 11350
+ redirect-loop | 8315
+ wrong-mimetype | 2394
+ terminal-bad-status | 1540
+ null-body | 1038
+ cdx-error | 272
+ empty-blob | 237
+ petabox-error | 213
+ wayback-error | 186
+ blocked-cookie | 107
+ timeout | 47
+ wayback-content-error | 26
+ spn2-cdx-lookup-failure | 21
+ skip-url-blocklist | 16
+ spn2-backoff | 15
+ body-too-large | 13
+ (20 rows)
+
+
+## Bulk Ingest
+
+Domains/prefixes should already have been filtered out in the transform
+script, so no filters are included here.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json';
+ # COPY 6398455
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json
+ # 6.40M 0:02:18 [46.2k/s]
+
+ cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # DONE
+
+Expect this ingest to take a week or so.
+
+Then, run stats again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 3617175
+ success | 2775036
+ no-pdf-link | 449298
+ link-loop | 74260
+ terminal-bad-status | 47819
+ wrong-mimetype | 20195
+ redirect-loop | 18197
+ empty-blob | 12127
+ cdx-error | 3038
+ skip-url-blocklist | 2630
+ wayback-error | 2599
+ petabox-error | 2354
+ wayback-content-error | 1617
+ blocked-cookie | 1293
+ null-body | 1038
+ body-too-large | 670
+ | 143
+ bad-gzip-encoding | 64
+ timeout | 47
+ spn2-cdx-lookup-failure | 20
+ (20 rows)
+
+
+## Crawl Seedlist
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'terminal-bad-status'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'timeout'
+ OR ingest_file_result.status = 'wayback-content-error'
+ )
+ ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json';
+ => COPY 3692846
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json
+ => 3.69M 0:01:19 [46.6k/s]
+
+This will be used for re-ingest later. For now, extract URLs:
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | jq .base_url -r \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ => 3.66M 0:00:59 [61.8k/s]
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | rg '"terminal_url"' \
+ | jq -r .result.terminal_url \
+ | rg -v ^null$ \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ => 0.00 0:00:05 [0.00 /s]
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | awk '{print "F+ " $1}' \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+What domains are we crawling?
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | sort -u -S 4G \
+ | cut -d/ -f3 \
+ | sort \
+ | uniq -c \
+ | sort -nr \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+
+ head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+ 91899 raco.cat
+ 70116 islandora.wrlc.org
+ 68708 urn.kb.se
+ 63726 citeseerx.ist.psu.edu
+ 50370 publications.rwth-aachen.de
+ 44885 urn.nsk.hr
+ 38429 server15795.contentdm.oclc.org
+ 33041 periodicos.ufpb.br
+ 32519 nbn-resolving.org
+ 31990 www.ajol.info
+ 24745 hal.archives-ouvertes.fr
+ 22569 id.nii.ac.jp
+ 17239 tilburguniversity.on.worldcat.org
+ 15873 dspace.nbuv.gov.ua
+ 15436 digitalcommons.wustl.edu
+ 14885 www.iiste.org
+ 14623 www.manchester.ac.uk
+ 14033 nbn-resolving.de
+ 13999 opus4.kobv.de
+ 13689 www.redalyc.org
+
+Sizes:
+
+ wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+
+Copy seedlist to crawler:
+
+ # as regular user
+ scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp
+
+## Post-Crawl Bulk Ingest
+
+ # ran 2022-11-16, after crawl cleanup
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -----------------------+---------
+ success | 4721164 +1,946,128
+ no-pdf-link | 1116290
+ no-capture | 673939
+ terminal-bad-status | 232217
+ link-loop | 148544
+ wrong-mimetype | 68841
+ redirect-loop | 26262
+ empty-blob | 17759
+ cdx-error | 6570
+ blocked-cookie | 4026
+ blocked-wall | 3054
+ skip-url-blocklist | 2924
+ body-too-large | 2404
+ bad-redirect | 1565
+ wayback-error | 1320
+ petabox-error | 1083
+ null-body | 1038
+ wayback-content-error | 264
+ bad-gzip-encoding | 150
+ | 143
+ (20 rows)
+
diff --git a/notes/ingest/NEXT.md b/notes/ingest/NEXT.md
new file mode 100644
index 0000000..8cdd6df
--- /dev/null
+++ b/notes/ingest/NEXT.md
@@ -0,0 +1,52 @@
+
+biorxiv
+medrxiv
+ doi:10.1101\/20*
+
+persee.fr 147k
+ publisher:persee in_ia:false is_oa:true
+ https://www.persee.fr/doc/pumus_1164-5385_1992_num_2_1_1013
+
+cairn.info: 161k
+ doi_prefix:10.3917 in_ia:false is_oa:true
+ https://www.cairn.info/revue-afrique-contemporaine-2011-3-page-161.htm
+ https://www.cairn.info/revue-cahiers-de-psychologie-clinique-2014-1-page-209.htm
+
+IOP OA: 169k
+ doi_prefix:10.1088 is_oa:true in_ia:false
+
+indian journals platform? 124k
+ doi_prefix:10.4103 in_ia:false is_oa:true
+ http://www.urologyannals.com/article.asp?issn=0974-7796;year=2011;volume=3;issue=3;spage=138;epage=140;aulast=Ahmad
+ http://www.neurologyindia.com/article.asp?issn=0028-3886;year=2011;volume=59;issue=4;spage=612;epage=615;aulast=Utsuki
+
+openedition? 48k
+ doi_prefix:10.4000 is_oa:true in_ia:false
+
+german medical science (GMS) 28k
+ doi_prefix:10.3205 in_ia:false is_oa:true
+ https://www.egms.de/static/en/journals/zma/2015-32/zma000965.shtml
+
+serbian chemistry 28k
+ doi_prefix:10.2298 in_ia:false is_oa:true
+ http://www.doiserbia.nb.rs/Article.aspx?ID=0352-51391000105H
+
+jalc oa doi: 82k
+ doi_registrar:jalc in_ia:false is_oa:true
+
+sage OA papers
+ https://journals.sagepub.com/doi/10.1177/034003529802400510
+
+Scientific Reports: 25k
+ in_ia:false container_id:"tnqhc2x2aneavcd3gx5h7mswhm"
+
+U Toronto press: 23k
+ publisher:"Toronto Press" in_ia:false is_oa:true
+ has an annoying bounce page
+
+ASHA (speech-language-hearing association): 7k
+ publisher:Speech-Language-Hearing in_ia:false is_oa:true
+
+MIT press journals
+
+
diff --git a/notes/ingest/es_csv_to_json.py b/notes/ingest/es_csv_to_json.py
new file mode 100755
index 0000000..4cd1811
--- /dev/null
+++ b/notes/ingest/es_csv_to_json.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+"""
+ input like:
+
+ doi,ident,"release_stage"
+ "10.7554/elife.38904",mxj534diw5gatc26rkif3io5xm,published
+ "10.7554/elife.41855",kag74qc6dfex7ftpfkf7iaus44,published
+ "10.7554/elife.41156",ienee5vxcbbbfhs2q54h4455hu,published
+ "10.7554/elife.43230",52rpllol2rcndjqs3xfwcldeka,published
+ "10.7554/elife.42591",fpz642gihrc3jd2vibg6gnjrxm,published
+
+ output like:
+
+ {
+ "base_url": "https://doi.org/10.7554/elife.38904",
+ "ext_ids": {
+ "doi": "10.7554/elife.38904"
+ },
+ "fatcat_release": "mxj534diw5gatc26rkif3io5xm",
+ "release_stage": "published"
+ }
+"""
+
+import csv, sys, json
+
+reader = csv.DictReader(sys.stdin)
+for row in reader:
+ d = {
+ "base_url": "https://doi.org/{}".format(row['doi']),
+ "ext_ids": {
+ "doi": row['doi'],
+ },
+ "fatcat_release": row['ident'],
+ "release_stage": row['release_stage'],
+ }
+ print(json.dumps(d))
diff --git a/notes/ingest_domains.txt b/notes/ingest_domains.txt
new file mode 100644
index 0000000..ae06272
--- /dev/null
+++ b/notes/ingest_domains.txt
@@ -0,0 +1,294 @@
+
+## Queries to find broken domains
+
+Top domains with failed ingests:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+Status overview for a particular domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT domain, terminal_status_code, COUNT((domain, terminal_status_code))
+ FROM (SELECT terminal_status_code, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ AND t1.terminal_status_code is not null
+ GROUP BY domain, terminal_status_code
+ ORDER BY COUNT DESC;
+
+Sample recent failures:
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%osapublishing.org%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+
+## Failing
+
+www.osapublishing.org
+
+    this publisher (The Optical Society) is systematically using a CAPTCHA to
+    gate access to PDFs. bummer! could ask them to white-list us?
+
+ has citation_pdf_url, so that isn't an issue
+
+ status: "no-pdf-link"
+ hops:
+ "https://doi.org/10.1364/optica.6.000798",
+ "https://www.osapublishing.org/viewmedia.cfm?uri=optica-6-6-798&seq=0"
+ "https://www.osapublishing.org/captcha/?guid=830CEAB5-09BD-6140-EABD-751200C78B1C"
+
+ domain | status | count
+ -----------------------+---------------------+-------
+ www.osapublishing.org | no-capture | 16680
+ www.osapublishing.org | no-pdf-link | 373
+ www.osapublishing.org | redirect-loop | 19
+ www.osapublishing.org | terminal-bad-status | 5
+ www.osapublishing.org | cdx-error | 1
+ www.osapublishing.org | wrong-mimetype | 1
+ www.osapublishing.org | spn-error | 1
+ www.osapublishing.org | success | 1
+ www.osapublishing.org | wayback-error | 1
+ (9 rows)
+
+www.persee.fr
+
+ Seems to be mostly blocking or rate-limiting?
+
+ domain | status | count
+ ---------------+-------------------------------------+-------
+ www.persee.fr | no-capture | 37862
+ www.persee.fr | terminal-bad-status | 3134
+ www.persee.fr | gateway-timeout | 2828
+ www.persee.fr | no-pdf-link | 431
+ www.persee.fr | spn-error | 75
+ www.persee.fr | redirect-loop | 23
+ www.persee.fr | success | 8
+ www.persee.fr | spn2-error | 2
+ www.persee.fr | spn2-error:soft-time-limit-exceeded | 1
+ www.persee.fr | wrong-mimetype | 1
+ (10 rows)
+
+journals.openedition.org
+
+ PDF access is via "freemium" subscription. Get redirects to:
+
+ https://auth.openedition.org/authorized_ip?url=http%3A%2F%2Fjournals.openedition.org%2Fnuevomundo%2Fpdf%2F61053
+
+ Content is technically open access (HTML and license; for all content?),
+ but can't be crawled as PDF without subscription.
+
+ domain | status | count
+ --------------------------+-------------------------+-------
+ journals.openedition.org | redirect-loop | 29587
+ journals.openedition.org | success | 6821
+ journals.openedition.org | no-pdf-link | 1507
+ journals.openedition.org | no-capture | 412
+ journals.openedition.org | wayback-error | 32
+ journals.openedition.org | wrong-mimetype | 27
+ journals.openedition.org | terminal-bad-status | 13
+ journals.openedition.org | spn2-cdx-lookup-failure | 4
+ journals.openedition.org | spn-remote-error | 1
+ journals.openedition.org | null-body | 1
+ journals.openedition.org | cdx-error | 1
+ (11 rows)
+
+journals.lww.com
+
+ no-pdf-link
+
+ domain | status | count
+ ------------------+----------------+-------
+ journals.lww.com | no-pdf-link | 11668
+ journals.lww.com | wrong-mimetype | 131
+ (2 rows)
+
+ doi prefix: 10.1097
+
+ <meta name="wkhealth_pdf_url" content="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" />
+ data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw=="
+
+    Something weird is going on; maybe they are blocking-via-redirect based on
+    our User-Agent? wget seems to work, so it's funny that they don't block that.
+
+musewide.aip.de
+
+ no-pdf-link
+
+koreascience.or.kr | no-pdf-link | 8867
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'osapublishing.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%osapublishing.org%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+www.cairn.info | link-loop | 8717
+
+easy.dans.knaw.nl | no-pdf-link | 8262
+scielo.conicyt.cl | no-pdf-link | 7925
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'scielo.conicyt.cl'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%scielo.conicyt.cl%'
+ AND status = 'terminal-bad-status'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+
+ domain | status | count
+ -------------------+---------------------+-------
+ scielo.conicyt.cl | no-pdf-link | 7926
+ scielo.conicyt.cl | success | 4972
+ scielo.conicyt.cl | terminal-bad-status | 1474
+ scielo.conicyt.cl | wrong-mimetype | 6
+ scielo.conicyt.cl | no-capture | 4
+ scielo.conicyt.cl | null-body | 1
+
+
+ pdf | https://doi.org/10.4067/s0370-41061980000300002 | 2020-02-22 23:55:56.235822+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0370-41061980000300002&lng=en&nrm=iso&tlng=en | 20200212201727 | 200 |
+ pdf | https://doi.org/10.4067/s0718-221x2019005000201 | 2020-02-22 23:01:49.070104+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-221X2019005000201&lng=en&nrm=iso&tlng=en | 20200214105308 | 200 |
+ pdf | https://doi.org/10.4067/s0717-75262011000200002 | 2020-02-22 22:49:36.429717+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-75262011000200002&lng=en&nrm=iso&tlng=en | 20200211205804 | 200 |
+ pdf | https://doi.org/10.4067/s0717-95022006000400029 | 2020-02-22 22:33:07.761766+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-95022006000400029&lng=en&nrm=iso&tlng=en | 20200209044048 | 200 |
+
+    On retry, these seem like successes? Maybe the earlier failures were a matter of warc/revisit records not getting handled correctly?
+
+ pdf | https://doi.org/10.4067/s0250-71611998007100009 | 2020-02-22 23:57:16.481703+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0250-71611998007100009&lng=en&nrm=iso&tlng=en | 20200212122939 | 200 |
+ pdf | https://doi.org/10.4067/s0716-27902005020300006 | 2020-02-22 23:56:01.247616+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0716-27902005020300006&lng=en&nrm=iso&tlng=en | 20200214192151 | 200 |
+ pdf | https://doi.org/10.4067/s0718-23762005000100015 | 2020-02-22 23:53:55.81526+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-23762005000100015&lng=en&nrm=iso&tlng=en | 20200214173237 | 200 |
+
+    These look like web/XML only.
+
+    TODO: XML ingest (and replay?) support. These are served as "<article>" XML; not sure if that is JATS or what.
+
+www.kci.go.kr | no-pdf-link | 6842
+www.m-hikari.com | no-pdf-link | 6763
+cshprotocols.cshlp.org | no-pdf-link | 6553
+www.bibliotekevirtual.org | no-pdf-link | 6309
+data.hpc.imperial.ac.uk | no-pdf-link | 6071
+projecteuclid.org | link-loop | 5970
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'projecteuclid.org'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%projecteuclid.org%'
+ AND status = 'link-loop'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ -------------------+-------------------------+-------
+ projecteuclid.org | link-loop | 5985
+ projecteuclid.org | success | 26
+ projecteuclid.org | wayback-error | 26
+ projecteuclid.org | wrong-mimetype | 17
+ projecteuclid.org | spn2-cdx-lookup-failure | 4
+ projecteuclid.org | other-mimetype | 4
+ projecteuclid.org | no-capture | 3
+ projecteuclid.org | terminal-bad-status | 2
+ projecteuclid.org | spn2-error:job-failed | 1
+ projecteuclid.org | spn-remote-error | 1
+ (10 rows)
+
+ Doing a cookie check and redirect.
+
+ TODO: brozzler behavior to "click the link" instead?
+
+www.scielo.br | no-pdf-link | 5823
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain = 'www.scielo.br'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC;
+
+ SELECT * FROM ingest_file_result
+ WHERE terminal_url LIKE '%www.scielo.br%'
+ AND status = 'no-pdf-link'
+ ORDER BY updated DESC
+ LIMIT 10;
+
+ domain | status | count
+ ---------------+-------------------------+-------
+ www.scielo.br | success | 35150
+ www.scielo.br | no-pdf-link | 5839
+ www.scielo.br | terminal-bad-status | 429
+ www.scielo.br | no-capture | 189
+ www.scielo.br | wrong-mimetype | 7
+ www.scielo.br | spn2-cdx-lookup-failure | 2
+ (6 rows)
+
+ Seems to just be the subset with no PDFs.
+
+get.iedadata.org | no-pdf-link | 5822
+www.pdcnet.org | no-pdf-link | 5798
+publications.rwth-aachen.de | no-pdf-link | 5323
+www.sciencedomain.org | no-pdf-link | 5231
+medicalforum.ch | terminal-bad-status | 4574
+jrnl.nau.edu.ua | link-loop | 4145
+ojs.academypublisher.com | no-pdf-link | 4017
+
+## MAG bulk ingest
+
+- dialnet.unirioja.es | redirect-loop | 240967
+ dialnet.unirioja.es | terminal-bad-status | 20320
+ => may be worth re-crawling via heritrix?
+- agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639
+ => and other *.onlinelibrary.wiley.com
+- www.researchgate.net | redirect-loop | 42859
+- www.redalyc.org:9081 | no-pdf-link | 10515
+- www.repository.naturalis.nl | redirect-loop | 8213
+- bjp.rcpsych.org | link-loop | 8045
+- journals.tubitak.gov.tr | wrong-mimetype | 7159
+- www.erudit.org | redirect-loop | 6819
+- papers.ssrn.com | redirect-loop | 27328
+ => blocking is pretty aggressive, using cookies or referrer or something.
+ maybe a brozzler behavior would work, but doesn't currently
+
+## Out of Scope
+
+Datasets only?
+
+- plutof.ut.ee
+- www.gbif.org
+- doi.pangaea.de
+- www.plate-archive.org
+
+Historical non-paper content:
+
+- dhz.uni-passau.de (newspapers)
+- digital.ucd.ie (irish historical)
+
+Mostly datasets (some PDF content):
+
+- *.figshare.com
+- zenodo.com
+- data.mendeley.com
diff --git a/notes/job_log.txt b/notes/job_log.txt
deleted file mode 100644
index 06490d2..0000000
--- a/notes/job_log.txt
+++ /dev/null
@@ -1,103 +0,0 @@
-
-### QA matchcrossref
-
-[D8C7F2CA7620450991838D540489948D/8B17786779BE44579C98D8A325AC5959] sandcrawler.ScoreJob/(1/1) ...-24-2102.32-matchcrossref
-
-Submitted: Fri Aug 24 21:03:09 UTC 2018
-Started: Fri Aug 24 21:03:20 UTC 2018
-Finished: Sat Aug 25 09:46:55 UTC 2018
-Elapsed: 12hrs, 43mins, 34sec
-Diagnostics:
-Average Map Time 24mins, 31sec
-Average Shuffle Time 15sec
-Average Merge Time 21sec
-Average Reduce Time 7mins, 17sec
-
-Map 2312 2312
-Reduce 100 100
-
-crossref-rows-filtered 73901964 0 73901964
-grobid-rows-filtered 1092992 0 1092992
-joined-rows 0 623837 623837
-
-cascading.flow.StepCounters
-Tuples_Read 94831255 0 94831255
-Tuples_Written 0 623837 623837
-
-Read_Duration 7108430 352241 7460671
-Tuples_Read 94831255 74994956 169826211
-Tuples_Written 74994956 623837 75618793
-Write_Duration 7650302 21468 7671770
-
-## QA UnGrobided
-
-Submitted: Sat Aug 25 01:23:22 UTC 2018
-Started: Sat Aug 25 05:06:36 UTC 2018
-Finished: Sat Aug 25 05:13:45 UTC 2018
-Elapsed: 7mins, 8sec
-Diagnostics:
-Average Map Time 1mins, 20sec
-Average Shuffle Time 12sec
-Average Merge Time 15sec
-Average Reduce Time 29sec
-
-Map 48 48
-Reduce 1 1
-
-bnewbold@bnewbold-dev$ gohdfs du -sh sandcrawler/output-qa/2018-08-25-0122.54-dumpungrobided/part*
-56.8M /user/bnewbold/sandcrawler/output-qa/2018-08-25-0122.54-dumpungrobided/part-00000
-
-## Prod UnGrobided
-
-[D76F6BF91D894E879E747C868B0DEDE7/394A1AFC44694992B71E6920AF8BA3FB] sandcrawler.DumpUnGrobidedJob/(1/1) ...26-0910.25-dumpungrobided
-
-Map 278 278
-Reduce 1 1
-
-Submitted: Sun Aug 26 09:10:51 UTC 2018
-Started: Sun Aug 26 09:18:21 UTC 2018
-Finished: Sun Aug 26 10:29:28 UTC 2018
-Elapsed: 1hrs, 11mins, 7sec
-Diagnostics:
-Average Map Time 4mins, 48sec
-Average Shuffle Time 24mins, 17sec
-Average Merge Time 14sec
-Average Reduce Time 13mins, 54sec
-
-
-cading.flow.StepCounters
-Name
-Map
-Reduce
-Total
-Tuples_Read 64510564 0 64510564
-Tuples_Written 0 21618164 21618164
-
-## Prod Crossref Match
-
-[6C063C0809244446BA8602C3BE99CEC2/5FE5D87899154F38991A1ED58BEB34D4] sandcrawler.ScoreJob/(1/1) ...-25-1753.01-matchcrossref
-
-Map 2427 2427
-Reduce 50 50
-
-Submitted: Sat Aug 25 17:53:50 UTC 2018
-Started: Sat Aug 25 17:53:59 UTC 2018
-Finished: Sun Aug 26 11:22:52 UTC 2018
-Elapsed: 17hrs, 28mins, 52sec
-Diagnostics:
-Average Map Time 31mins, 20sec
-Average Shuffle Time 1mins, 21sec
-Average Merge Time 41sec
-Average Reduce Time 3hrs, 14mins, 39sec
-
-crossref-rows-filtered 73901964 0 73901964
-grobid-rows-filtered 14222226 0 14222226
-joined-rows 0 14115453 14115453
-
-## "Prod" Fatcat Group Works (run 2019-08-10)
-
- ./please --prod groupworks-fatcat hdfs:///user/bnewbold/release_export.2019-07-07.json
-
- job_1559844455575_118299
- http://ia802401.us.archive.org:6988/proxy/application_1559844455575_118299
-
diff --git a/notes/possible_ingest_targets.txt b/notes/possible_ingest_targets.txt
new file mode 100644
index 0000000..fcdc3e4
--- /dev/null
+++ b/notes/possible_ingest_targets.txt
@@ -0,0 +1,15 @@
+
+- all releases from small journals (eg, fewer than 200 papers published), regardless of OA status, and not big5
+
+more complex crawling/content:
+- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url
+- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data)
+- doi.ala.org.au: possible dataset ingest source
+- peerj.com, at least reviews, should be HTML ingest? or are some PDF?
+- publons.com should be HTML ingest, possibly special case for scope
+- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug
+
+other tasks:
+- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+- push/deploy sandcrawler changes
diff --git a/notes/tasks/2020-01-06_heuristic_cdx.txt b/notes/tasks/2020-01-06_heuristic_cdx.txt
new file mode 100644
index 0000000..209fa4f
--- /dev/null
+++ b/notes/tasks/2020-01-06_heuristic_cdx.txt
@@ -0,0 +1,37 @@
+
+Wanted to include a large number of additional CDX lines based on a regex
+pattern. These are primarily .edu domains with things that look like user
+accounts *and* .pdf file extensions in the path.
+
+## Commands
+
+aitio:/fast/gwb_pdfs
+
+ pdfs/gwb-pdf-20191005172329-url-heuristics-edu
+ pdfs/gwb-pdf-20191005172329-url-heuristics
+
+
+To filter to unique URL/SHA1 lines:
+
+ cat raw.cdx | sort -u -t' ' -k3,6 -S 4G > uniq.cdx
+
+ cat gwb-pdf-20191005172329-url-heuristics-edu/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx
+ cat gwb-pdf-20191005172329-url-heuristics/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx
+
+ 7241795 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx
+ 41137888 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx
+
+ cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx | sort -u -S 4G | wc -l
+ 7241795
+
+ cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx | sort -u -S 4G | wc -l
+ 41137888
+
+ ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx
+ Worker: Counter({'total': 7239153, 'insert-cdx': 6845283, 'update-cdx': 0})
+ CDX lines pushed: Counter({'total': 7241795, 'pushed': 7239153, 'skip-parse': 2603, 'skip-mimetype': 39})
+
+ ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx
+ Worker: Counter({'total': 41030360, 'insert-cdx': 22430064, 'update-cdx': 0})
+ CDX lines pushed: Counter({'total': 41137888, 'pushed': 41030360, 'skip-mimetype': 87341, 'skip-parse': 20187})
+
diff --git a/notes/tasks/2020-01-27_cleanup_cdx.md b/notes/tasks/2020-01-27_cleanup_cdx.md
new file mode 100644
index 0000000..54db92e
--- /dev/null
+++ b/notes/tasks/2020-01-27_cleanup_cdx.md
@@ -0,0 +1,34 @@
+
+Accidentally seem to have backfilled many CDX lines with non-PDF content.
+Should clear these out!
+
+Something like:
+
+ mimetype = 'text/html'
+ not in file_meta
+
+Or maybe instead:
+
+ mimetype = 'text/html'
+ not in file_meta
+
+SQL:
+
+ SELECT * FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01' LIMIT 5;
+ SELECT COUNT(1) FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01';
+ => 24841846
+
+ SELECT * FROM cdx LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL LIMIT 5;
+ SELECT COUNT(1) FROM cdx LEFT JOIN file_meta ON cdx.sha1hex = file_meta.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL;
+ => 24547552
+
+ DELETE FROM cdx
+ WHERE sha1hex IN
+ (SELECT cdx.sha1hex
+ FROM cdx
+ LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex
+ WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL);
+ => DELETE 24553428
+
+Slightly more rows deleted than the earlier count... probably should have had
+an "AND cdx.mimetype = 'text/html'" condition in the outer DELETE WHERE clause
+as well.
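+
+Something like this would probably have been safer (a sketch, not run):
+
+    DELETE FROM cdx
+    WHERE cdx.mimetype = 'text/html'
+    AND sha1hex IN
+    (SELECT cdx.sha1hex
+     FROM cdx
+     LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex
+     WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL);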
diff --git a/notes/tasks/2020-01-27_grobid_backfill.md b/notes/tasks/2020-01-27_grobid_backfill.md
new file mode 100644
index 0000000..d70e203
--- /dev/null
+++ b/notes/tasks/2020-01-27_grobid_backfill.md
@@ -0,0 +1,40 @@
+
+Recently added a bunch of PDFs to sandcrawler-db. Want to GROBID extract the
+~15m which haven't been processed yet. Also want to re-GROBID a batch of
+PDFs-in-zipfiles from archive.org; will probably also want to re-GROBID other
+petabox files soon.
+
+## pre-1923 zipfile re-extraction
+
+Exact commands (in parallel):
+
+ fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
+ parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
+ './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+
+ fd .zip /srv/sandcrawler/tasks/crossref-pre-1923-scholarly-works/ | \
+ parallel -j16 --progress --joblog extract_tasks_1923.log --resume-failed \
+ './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'
+
+## petabox re-extraction
+
+This was run around 2020-02-03. There are a few million remaining PDFs that
+have only partial file metadata (`file_meta`), meaning they were run through an
+old version of the sandcrawler code. Want to get them all covered, maybe even
+DELETE the missing ones, so re-grobiding petabox-only files.
+
+There are about 2,887,834 files in petabox, only 46,232 need re-processing (!).
+
+ psql sandcrawler < dump_regrobid_pdf_petabox.sql
+ cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json
+
+These are pretty few... maybe they would even have been caught by the wayback backfill?
+
+Small start:
+
+ head /srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.uniq.json | ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
+Full batch, 25x parallel:
+
+ cat /srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.uniq.json | pv -l | parallel -j25 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+
diff --git a/notes/tasks/2020-02-14_pdftrio.md b/notes/tasks/2020-02-14_pdftrio.md
new file mode 100644
index 0000000..e6f8d8e
--- /dev/null
+++ b/notes/tasks/2020-02-14_pdftrio.md
@@ -0,0 +1,162 @@
+
+First end-to-end `pdf_trio` results!
+
+## Source
+
+Will use AIT partner #1830 (U Alberta) CDX as input. These are unique by
+digest, about 100k.
+
+ ArchiveIt-Collection-1830.download.cdx
+
+## Testing/Prep
+
+Versions/setup:
+
+ sandcrawler: f613f69a40fcc9a445f21cadd35d7c36c8061db8
+ => patched to 'auto' mode
+
+ pdf_trio: 03bd3fdc15418462b2b1582e4f967f26ddcb43e2
+
+ pdftrio: 'auto' mode
+
+ uwsgi: 16x processes
+
+ sudo docker run --rm -p 8501:8501 -e TF_XLA_FLAGS=--tf_xla_cpu_global_jit -e KMP_AFFINITY=granularity=fine,compact,1,0 -e KMP_BLOCKTIME=0 -e OMP_NUM_THREADS=24 -e TENSORFLOW_INTER_OP_PARALLELISM=1 -e TENSORFLOW_INTRA_OP_PARALLELISM=24 -v /srv/pdftrio//models/bert_models:/models/bert_model -v /srv/pdftrio//models/pdf_image_classifier_model:/models/image_model -v /srv/pdftrio//config/tfserving_models_docker.config:/models/tfserving_models.config -v /srv/pdftrio/config/tfserving_batch.config:/models/tfserving_batch.config --name pdftrio-tfserving tensorflow/serving --model_config_file=/models/tfserving_models.config --enable_batching=true --batching_parameters_file=/models/tfserving_batch.config
+
+Basic testing:
+
+ head -n100 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j20 --pipe --linebuffer ./pdftrio_tool.py --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx - | jq .
+
+ head -n100 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j20 --pipe --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+ => Running in kafka output mode, publishing to sandcrawler-qa.pdftrio-output
+
+
+On the persist side:
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-qa.pdftrio-output | head | jq .
+ => looks fine
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org --env qa persist-pdftrio
+ => Consuming from kafka topic sandcrawler-qa.pdftrio-output, group persist-pdftrio
+
+Ah, don't forget, start persist before writing to topic! Or would need to reset
+offsets to start.
+
+Seems to be only a single pdftext instance running? Very low CPU
+
+ head -n500 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j40 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+That is much better! CPU still not pegged, so maybe could do 50x processes? Lots of I/O wait. Blech.
+
+Zero ("0") not getting persisted for any columns (fixed in sandcrawler/db.py)
+
+`models_date` not getting set. Added `PDFTRIO_MODELS_DATE="2020-01-01"` to env. (TODO: ansible)
+
+## Prod Run
+
+ ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org --env prod persist-pdftrio
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j40 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+Worker CPU basically blocked on pdftotext, multiple 100% CPU. Presumably I/O
+wait? Though not totally sure.
+
+htop:
+
+ PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
+ 17951 pdftrio 20 0 51756 12868 5856 R 90.1 0.0 0:06.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 17870 pdftrio 20 0 52004 12964 5684 R 87.4 0.0 0:08.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 13735 root 20 0 10.4G 3815M 4144 S 79.6 7.6 48h02:37 tensorflow_model_server --port=8500 --rest_api_port=850
+ 14522 pdftrio 20 0 2817M 1331M 16896 R 43.1 2.6 0:57.75 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 18027 pdftrio 20 0 49192 10692 6116 R 39.8 0.0 0:00.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 14518 pdftrio 20 0 2818M 1336M 16836 S 33.3 2.7 0:47.46 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14504 pdftrio 20 0 2731M 1310M 13164 D 32.6 2.6 0:34.81 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14526 pdftrio 20 0 2816M 1333M 16832 R 28.7 2.7 0:57.22 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14500 pdftrio 20 0 2729M 1306M 13160 R 20.9 2.6 0:22.57 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14492 pdftrio 20 0 2729M 1307M 13156 S 17.6 2.6 0:17.91 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14508 pdftrio 20 0 2734M 1312M 14380 D 14.4 2.6 0:38.75 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14496 pdftrio 20 0 2728M 1300M 13160 S 13.7 2.6 0:18.00 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 17314 sandcrawl 20 0 56668 18228 4304 D 13.7 0.0 0:02.31 perl /usr/bin/parallel -j40 -N1 --pipe --round-robin --
+ 14472 pdftrio 20 0 2725M 1283M 13136 S 12.4 2.6 0:05.69 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14513 pdftrio 20 0 2730M 1309M 14300 S 11.1 2.6 0:40.32 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14480 pdftrio 20 0 2725M 1291M 13144 S 10.4 2.6 0:08.77 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14488 pdftrio 20 0 2725M 1294M 13152 S 9.8 2.6 0:08.18 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 14468 pdftrio 20 0 2717M 1271M 13088 S 6.5 2.5 0:02.42 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 17411 sandcrawl 20 0 556M 53840 14936 S 6.5 0.1 0:01.57 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 14530 pdftrio 20 0 2524M 1252M 3492 S 4.6 2.5 0:12.72 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
+ 7311 bnewbold 20 0 27716 5520 3128 R 3.9 0.0 0:41.59 htop
+ 17444 sandcrawl 20 0 552M 50456 14892 S 3.9 0.1 0:01.54 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 18042 pdftrio 20 0 46068 6588 5328 R 3.3 0.0 0:00.05 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
+ 18043 pdftrio 20 0 4 4 0 R 2.6 0.0 0:00.04
+ 2203 grobid 20 0 6334M 126M 4188 S 0.7 0.3 3h27:32 /usr/lib/jvm/java-8-openjdk-amd64/bin/java -XX:MaxMetas
+ 17419 sandcrawl 20 0 619M 116M 15248 S 0.7 0.2 0:02.68 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17440 sandcrawl 20 0 578M 76948 15160 S 0.7 0.1 0:01.54 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 13848 root 20 0 0 0 0 D 0.7 0.0 0:00.69 kworker/u60:1
+ 17443 sandcrawl 20 0 578M 76500 14912 S 0.7 0.1 0:01.74 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17414 sandcrawl 20 0 580M 77720 15036 S 0.0 0.2 0:01.77 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17432 sandcrawl 20 0 563M 61460 14976 S 0.0 0.1 0:01.59 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17442 sandcrawl 20 0 561M 53096 15240 S 0.0 0.1 0:01.47 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17433 sandcrawl 20 0 559M 57160 15176 S 0.0 0.1 0:01.52 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17431 sandcrawl 20 0 554M 50960 14892 S 0.0 0.1 0:01.37 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+ 17413 sandcrawl 20 0 554M 52376 14920 S 0.0 0.1 0:01.57 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
+
+dstat:
+
+ ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
+ usr sys idl wai hiq siq| read writ| recv send| in out | int csw
+ 32 9 22 37 0 0| 0 37M| 20M 12M| 0 0 | 35k 64k
+ 20 6 24 50 0 0| 0 20M| 30M 5662k| 0 0 | 27k 48k
+ 27 7 24 43 0 0| 0 26M|8712k 6289k| 0 0 | 21k 114k
+ 30 8 23 38 0 0|4096B 61M| 17M 20M| 0 0 | 31k 54k
+ 33 6 17 44 0 0| 0 32M| 14M 6384k| 0 0 | 27k 46k
+ 25 6 24 44 0 0| 0 19M| 18M 13M| 0 0 | 27k 179k
+ 40 6 19 35 0 0|8192B 25M|7855k 6661k| 0 0 | 31k 85k
+ 59 8 12 20 0 0| 0 39M|4177k 33M| 0 0 | 34k 64k
+ 34 4 17 44 0 0| 0 16M|7527k 11M| 0 0 | 22k 45k
+ 44 7 17 32 0 0| 0 30M| 20M 291k| 0 0 | 36k 62k
+
+Create tmpfs:
+
+ sudo mkdir -p /pdftrio-ramdisk
+ #sudo mount -t tmpfs -o size=2g tmpfs /pdftrio-ramdisk
+ sudo mount -t tmpfs -o size=6g tmpfs /pdftrio-ramdisk
+
+add to pdftrio config env and restart:
+
+ TEMP=/run/pdf_trio
+
+Seems to have worked. Pretty much maxed CPU, may need to back-off parallelism. Doing more than 31/sec.
+
+Errors were not getting encoded correctly:
+
+ File "/fast/sandcrawler/python/sandcrawler/persist.py", line 331, in push_batch
+ r['pdf_trio']['key'] = r['key']
+ KeyError: 'pdf_trio'
+
+Fixed in sandcrawler worker, and patched persist to work around this.
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j30 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+Wow, 30x parallelism waaaay less?
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j30 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+What changed? Confused. Load average was like 40.
+
+Via kafka, as much as 69.71/sec! Errors?
+
+Hrm, this whole `auto` thing. I am very skeptical. Should also do a run as `all`, -j20.
+
+ Worker: Counter({'total': 1916, 'pushed': 1916})
+ CDX lines pushed: Counter({'total': 1934, 'pushed': 1916, 'skip-parse': 18})
+
+Hit some bugs, causing failure, but still seem to have processed a good chunk.
+
+Switched to `all`, running a different batch:
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1914.download.cdx | parallel -j20 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
+After flag change, another batch in `all`:
+
+ time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-2566.download.cdx | parallel -j20 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
+
diff --git a/notes/tasks/2020-07-22_processing_holes.md b/notes/tasks/2020-07-22_processing_holes.md
new file mode 100644
index 0000000..70e2b59
--- /dev/null
+++ b/notes/tasks/2020-07-22_processing_holes.md
@@ -0,0 +1,120 @@
+
+Want to clean up missing/partial processing (GROBID, `pdf_meta`, `file_meta`)
+in sandcrawler database.
+
+
+## `pdf_meta` for petabox rows
+
+Ran `dump_unextracted_pdf_petabox.sql` SQL, which resulted in a .json file.
+
+ wc -l dump_unextracted_pdf_petabox.2020-07-22.json
+ 1503086 dump_unextracted_pdf_petabox.2020-07-22.json
+
+Great, 1.5 million, not too many. Start small:
+
+ head -n1000 dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+Full batch:
+
+ cat dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+Re-ran on 2020-08-19:
+
+ wc -l dump_unextracted_pdf_petabox.2020-08-19.json
+ 971194 dump_unextracted_pdf_petabox.2020-08-19.json
+
+## `pdf_meta` missing CDX rows
+
+First, the GROBID-ized rows, but only those which also have a fatcat file.
+
+10,755,365! That is a lot still to process.
+
+ cat dump_unextracted_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+Re-ran on 2020-08-19:
+
+ wc -l dump_unextracted_pdf.fatcat.2020-08-19.json
+ 65517 dump_unextracted_pdf.fatcat.2020-08-19.json
+
+Enqueued!
+
+## `GROBID` missing petabox rows
+
+ wc -l /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json
+ 972221 /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json
+
+Start small:
+
+ head -n1000 dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Full batch:
+
+ cat dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Re-ran on 2020-08-19:
+
+ wc -l dump_ungrobided_pdf_petabox.2020-08-19.json
+ 933 dump_ungrobided_pdf_petabox.2020-08-19.json
+
+Enqueued!
+
+## `GROBID` for missing CDX rows in fatcat
+
+ wc -l dump_ungrobided_pdf.fatcat.2020-07-22.json
+ 1808580 dump_ungrobided_pdf.fatcat.2020-07-22.json
+
+Full batch:
+
+ cat dump_ungrobided_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+## `GROBID` for bad status
+
+Eg, wayback errors.
+
+TODO
+
+## `pdf_trio` for OA journal crawls
+
+TODO
+
+## `pdf_trio` for "included by heuristic", not in fatcat
+
+TODO
+
+## Live-ingest missing arxiv papers
+
+ ./fatcat_ingest.py --allow-non-oa --limit 10000 query arxiv_id:* > /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json
+ => Expecting 1505184 release objects in search queries
+
+ cat /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 22
+
+Repeating this every few days should (?) result in the whole backlog of arxiv
+papers getting indexed. Could focus on recent years to start (with a query
+filter).
+
+## re-ingest spn2 errors (all time)
+
+Eg:
+
+ spn2-cdx-lookup-failure: 143963
+ spn-error: 101773
+ spn2-error: 16342
+
+TODO
+
+## re-try CDX errors
+
+Eg, for unpaywall only, bulk ingest all `cdx-error`.
+
+TODO
+
+## live ingest unpaywall `no-capture` URLs
+
+After re-trying the CDX errors for unpaywall URLs (see above), count all the
+no-capture URLs, and if reasonable recrawl them all in live mode ("reasonable"
+meaning fewer than 200k or so URLs).
+
+Could also force recrawl (not using CDX lookups) for some publisher platforms
+if that made sense.
+
+TODO
diff --git a/notes/tasks/2020-08-20_file_meta.md b/notes/tasks/2020-08-20_file_meta.md
new file mode 100644
index 0000000..39c84dd
--- /dev/null
+++ b/notes/tasks/2020-08-20_file_meta.md
@@ -0,0 +1,66 @@
+
+Want to update fatcat file entities with "full" file metadata for those which are missing it.
+
+How many `file_meta` rows *still* don't have metadata?
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+ => 62962
+
+First, generate a list of sha1hex values from the most recent bulk export which
+are missing at least some metadata (based on missing sha256):
+
+ zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv
+ => 18.7M 0:05:46 [53.8k/s]
+
+Then dump the entire sandcrawler `file_meta` table as TSV, with first column
+sha1hex and second column JSON with all the file metadata fields:
+
+ COPY (
+ SELECT sha1hex, row_to_json(file_meta)
+ FROM file_meta
+ WHERE sha256hex IS NOT NULL
+ ORDER BY sha1hex ASC
+ )
+ TO '/grande/snapshots/file_meta_dump.tsv'
+ WITH NULL '';
+
+Join/cut:
+
+ export LC_ALL=C
+ join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json
+ => 18.1M 0:03:37 [83.2k/s]
+
+Check counts:
+
+ cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l
+ => 18135313
+
+ zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr
+ 18103860 application/pdf
+ 29977 application/octet-stream
+ 876 text/html
+ 199 application/postscript
+ 171 application/gzip
+ 84 text/plain
+ 48 application/xml
+ 38 application/vnd.ms-powerpoint
+ 16 application/msword
+ 8 application/vnd.openxmlformats-officedocument.wordprocessingml.document
+ 6 image/jpeg
+ 4 message/rfc822
+ 4 application/zip
+ 4 application/vnd.openxmlformats-officedocument.presentationml.presentation
+ 3 text/x-tex
+ 3 application/x-dosexec
+ 2 application/x-tar
+ 2 application/vnd.ms-tnef
+ 1 video/mpeg
+ 1 image/tiff
+ 1 image/svg+xml
+ 1 image/png
+ 1 image/gif
+ 1 audio/x-ape
+ 1 application/vnd.ms-office
+ 1 application/CDFV2-unknown
+
+TODO: fatcat importer
diff --git a/notes/tasks/2020-10-21_pdfextract_holes.md b/notes/tasks/2020-10-21_pdfextract_holes.md
new file mode 100644
index 0000000..c0bb65e
--- /dev/null
+++ b/notes/tasks/2020-10-21_pdfextract_holes.md
@@ -0,0 +1,74 @@
+
+Realized I had not enabled persisting of PDF extraction results (thumbnail,
+text) in the ingest worker when it was added over the summer. So now need to
+run a catch-up. This applies to both "live" and "bulk" ingest.
+
+## `cdx` / `ingest` / `grobid` catch-up
+
+First, re-run extraction for cases where we did an ingest, and grobid ran
+successfully, and we have a CDX row, but no `pdf_meta`:
+
+ -- this is a slow query
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE cdx.sha1hex IS NOT NULL
+ --AND fatcat_file.sha1hex IS NOT NULL
+ AND ingest_file_result.terminal_sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'
+ WITH NULL '';
+ => 19,676,116
+
+Wow, that is a lot. Many from recent OAI-PMH and OA crawls, presumably.
+
+ cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+And again, after a couple partitions got hung up:
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE cdx.sha1hex IS NOT NULL
+ --AND fatcat_file.sha1hex IS NOT NULL
+ AND ingest_file_result.terminal_sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json'
+ WITH NULL '';
+
+
+ cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+ => 562k 0:00:16 [34.6k/s]
+
+## `petabox` / `grobid` catch-up
+
+These didn't all seem to extract correctly before: after 1.5m rows, there were
+still 900k unprocessed. Trying again.
+
+ COPY (
+ SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox)
+ FROM grobid
+ LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE petabox.sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+ )
+ TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json'
+ WITH NULL '';
+
+ cat /grande/snapshots/dump_unextracted_pdf_petabox.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+## `cdx` / `grobid` catch-up
+
+Next will be to process PDFs with GROBID and CDX but no ingest.
+
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
new file mode 100644
index 0000000..cd8176e
--- /dev/null
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -0,0 +1,70 @@
+
+Want to dump a URL list to share with partners, filtered to content we think is
+likely to be scholarly.
+
+Columns to include:
+
+- original URL
+- capture timestamp
+- SHA1
+
+## Stats Overview
+
+file_meta table, mimetype=application/pdf: 173,816,433
+
+cdx table, mimetype=application/pdf: 131,346,703
+
+ingest_file_result table, pdf, success: 66,487,928
+
+## Ingested PDF URLs
+
+"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also?
+
+ COPY (
+ SELECT
+ base_url as start_url,
+ terminal_url as pdf_url,
+ terminal_dt as pdf_url_timestamp,
+ terminal_sha1hex as pdf_sha1hex
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv'
+ WITH NULL '';
+ => 77,892,849
+
+## CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require
+
+ COPY (
+ SELECT
+ cdx.url as pdf_url,
+ cdx.datetime as pdf_url_timestamp,
+ cdx.sha1hex as pdf_sha1hex
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv'
+ WITH NULL '';
+ => 147,837,935
+
+## Processed web PDFs
+
+"Parsed web PDFs": `file_meta`, left join CDX
+
+(didn't do this one)
+
+---
+
+Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09>
diff --git a/notes/tasks/2021-10-29_crossref_refs_backfill.md b/notes/tasks/2021-10-29_crossref_refs_backfill.md
new file mode 100644
index 0000000..94eefec
--- /dev/null
+++ b/notes/tasks/2021-10-29_crossref_refs_backfill.md
@@ -0,0 +1,235 @@
+
+The current sandcrawler-db crossref table was backfilled from a 2021-01
+snapshot, and has not been updated since.
+
+Would like to use the existing fatcat Kafka feed to keep the crossref table up
+to date, and also backfill in GROBID reference parsing of all `unstructured`
+references.
+
+Current plan is:
+
+1. use kafkacat CLI to dump crossref Kafka topic, from the beginning of 2021 up
+ to some recent date
+2. use `persist_tool.py`, with a large batch size (200?) to backfill this dump
+ into sandcrawler-db. this will update some rows multiple times (if there
+ have been updates)
+3. dump the full crossref table, as a point-in-time snapshot
+4. filter to crossref records that have `unstructured` references in them (at
+ all)
+5. use `grobid_tool.py` with `parallel` to batch process references
+6. backfill these refs using a simple SQL COPY statement
+7. deploy crossref persist worker, with ref updates on, and roll the consumer
+ group back to date of dump
+8. wait for everything to catch up
+
+
+## Commands
+
+Get a timestamp in milliseconds:
+
+ 2021-01-01 is:
+ 1609488000 in unix time (seconds)
+      1609488000000 in milliseconds
+
+Hrm, oldest messages seem to actually be from 2021-04-28T19:21:10Z though. Due
+to topic compaction? Yup, we have a 180 day compaction policy on that topic,
+probably from when kafka space was tight. Oh well!
+
+Updated retention for this topic to `46656000000` (~540 days, ~18 months) using
+`kafka-manager` web app.
+
+ kafkacat -C -b wbgrp-svc263.us.archive.org -t fatcat-prod.api-crossref -o s@1609488000000 \
+ | pv -l \
+ | gzip \
+ > crossref_feed_start20210428_end20211029.json.gz
+
+This resulted in ~36 million rows, 46GB.
+
+`scp` that around, then run persist on `sandcrawler-db`:
+
+ # in pipenv, as sandcrawler user
+ # manually edited to set batch size to 200
+ zcat /srv/sandcrawler/tasks/crossref_feed_start20210428_end20211029.json.gz \
+ | pv -l \
+ | ./persist_tool.py crossref -
+ => 36.8M 11:02:43 [ 925 /s]
+
+With a single thread, the persist process runs at about 1,000 rows/sec, which
+works out to about 10 hours for 36 million rows.
+
+At the start of this process, total PostgreSQL database size is 832.21G. At the
+end, 902.51G. Have not run a `VACUUM ALL` or anything like that.
+
+Query to dump crossref rows which have any refs and compress output with pigz:
+
+ # dump_crossref.sql
+ COPY (
+ SELECT record
+ FROM crossref
+ WHERE record::jsonb @? '$.reference[*].unstructured'
+ -- LIMIT 5
+ )
+ TO STDOUT
+ WITH NULL '';
+
+    # 'sed' required because of double quote escaping in postgresql output:
+ # https://stackoverflow.com/questions/29869983/postgres-row-to-json-produces-invalid-json-with-double-escaped-quotes/29871069
+ # 'rg' filter is just being conservative
+
+ # XXX: next time add to the pipeline: rg -v "\\\\"
+ # or, find some way to filter/transform this kind of SQL export better?
+ psql sandcrawler < dump_crossref.sql \
+ | sed 's/\\"/\"/g' \
+ | rg '^\{' \
+ | pv -l \
+ | pigz \
+ > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz
+ => 26.1M 3:22:51 [2.15k/s]
+
+ # NOTE: -j40 is for production run with ~dedicated GROBID server with many cores
+ zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz \
+ | rg -v "\\\\" \
+ | parallel -j35 --linebuffer --round-robin --pipe ./grobid_tool.py --grobid-host http://wbgrp-svc096.us.archive.org:8070 parse-crossref-refs - \
+ | pv -l \
+ | pigz \
+ > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz
+
+ # from earlier testing with -j40: able to do about 300-500 records/second
+ # 23.9k 0:01:14 [ 320 /s]
+ # 134518 total refs parsed
+ # ~1817 refs/second parsed
+
+ # with errors, got through about: 2.08M 1:38:20 [ 352 /s]
+    # was still seeing bad JSON?
+ # JSON lines pushed: Counter({'total': 105898, 'pushed': 105886, 'error-json-decode': 12})
+
+ # finally, without errors:
+ # 18.6M 8:35:02 [ 603 /s]
+
+In the next step, going to need a small direct persist worker to copy lines
+verbatim into just the `grobid_refs` table.
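+
+A minimal sketch of what that direct persist step could look like (the column
+and JSON key names here are hypothetical; the `persist_tool.py grobid-refs`
+worker used below is the real implementation):
+
+    import json
+    import sys
+
+    import psycopg2
+    import psycopg2.extras
+
+    UPSERT_SQL = """
+        INSERT INTO grobid_refs (source, source_id, refs_json)
+        VALUES %s
+        ON CONFLICT (source, source_id) DO UPDATE SET refs_json = EXCLUDED.refs_json
+    """
+
+    conn = psycopg2.connect("dbname=sandcrawler")
+    with conn, conn.cursor() as cur:
+        batch = []
+        for line in sys.stdin:
+            rec = json.loads(line)
+            # hypothetical keys; the refs JSON is copied through verbatim
+            row = (rec["source"], rec["source_id"], psycopg2.extras.Json(rec["refs_json"]))
+            batch.append(row)
+            if len(batch) >= 200:
+                psycopg2.extras.execute_values(cur, UPSERT_SQL, batch)
+                batch = []
+        if batch:
+            psycopg2.extras.execute_values(cur, UPSERT_SQL, batch)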
+
+## Errors
+
+Got errors when running for real:
+
+ xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 114, column 33
+
+ requests.exceptions.HTTPError: 500 Server Error: Internal Server Error for url: http://wbgrp-svc096.us.archive.org:8070/api/processCitationList
+
+ urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='wbgrp-svc096.us.archive.org', port=8070): Max retries exceeded with url: /api/processCitationList (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f54b0a3bd00>: Failed to establish a new connection: [Errno 99] Cannot assign requested address'))
+
+
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ERROR [2021-11-03 06:57:32,569] org.grobid.service.process.GrobidRestProcessString: An unexpected exception occurs.
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! java.lang.NullPointerException: null
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.data.BiblioItem.cleanTitles(BiblioItem.java:1784)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingLayoutTokenMultiple(CitationParser.java:175)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingStringMultiple(CitationParser.java:92)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.Engine.processRawReferences(Engine.java:168)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.process.GrobidRestProcessString.processCitationList(GrobidRestProcessString.java:316)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.GrobidRestService.processCitationListReturnXml_post(GrobidRestService.java:581)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.GeneratedMethodAccessor19.invoke(Unknown Source)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
+ Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at java.lang.reflect.Method.invoke(Method.java:498)
+ [...]
+
+Bogus example reference causing 500 error (among other non-error citations) (doi:10.5817/cz.muni.m210-9541-2019):
+
+ 'Müller, R., Šidák, P. (2012). Slovník novější literární teorie. Praha: Academia.'
+ '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0'
+ 'Šotkovská, J. (2008). Rané divadelní hry Milana Uhdeho; diplomová práce. Brno: Masarykova univerzita.',
+
+`s.strip()` in Python would remove these non-breaking spaces (update: implemented this later).
+
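+For example, a quick check that a plain `str.strip()` call handles the `\xa0`
+runs (so the empty "reference" above collapses to an empty string and can be
+skipped before sending to GROBID):
+
+    >>> '\xa0\xa0\xa0\xa0\xa0'.strip()
+    ''
+    >>> 'Šotkovská, J. (2008). Rané divadelní hry Milana Uhdeho.\xa0\xa0'.strip()
+    'Šotkovská, J. (2008). Rané divadelní hry Milana Uhdeho.'
+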
+ Maheswari, S., Vijayalakshmi, C.: Optimization Model for Electricity Distribution System Control using Communication System by La-grangian Relaxation Technique. CiiT International Journal of Wireless Communication 3(3), 183–187 (2011) (Print: ISSN 0974 – 9756 & Online: ISSN 0974 – 9640)
+
+Also:
+
+ truncating very large reference list for doi:10.1017/chol9780521264303.033 len:2281
+ truncating very large reference list for doi:10.1017/chol9780521263351.011 len:3129
+ truncating very large reference list for doi:10.1017/chol9780521263351.022 len:2968
+ truncating very large reference list for doi:10.1017/chol9780521264303.036 len:2221
+ truncating very large reference list for doi:10.1017/chol9780521264303.007 len:2238
+ truncating very large reference list for doi:10.1017/chol9780521086912.001 len:2177
+ truncating very large reference list for doi:10.1017/chol9780521228046.002 len:2133
+ truncating very large reference list for doi:10.1017/chol9780521264303.035 len:2221
+ truncating very large reference list for doi:10.1017/chol9780521264303.002 len:2279
+
+Seems like bumping to 2500 as the maximum reference list size might be
+reasonable (it is 2000 currently).
+
+After some refactoring, still getting:
+
+ requests.exceptions.ConnectionError
+
+This is because I am doing POST without a session.
+
+Then, still got requests.exceptions.ReadTimeout
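+
+The kind of client-side fix this points at (a sketch, assuming the worker uses
+`requests`; the form parameter name is taken from the GROBID citation API but
+should be treated as an assumption here):
+
+    import requests
+    from requests.adapters import HTTPAdapter
+
+    session = requests.Session()
+    # a small pooled set of connections per GROBID host, instead of opening a
+    # fresh TCP connection (and burning an ephemeral port) for every request
+    session.mount("http://", HTTPAdapter(pool_connections=1, pool_maxsize=4))
+
+    citations = [
+        "Maheswari, S., Vijayalakshmi, C.: Optimization Model for Electricity Distribution [...]",
+    ]
+    resp = session.post(
+        "http://wbgrp-svc096.us.archive.org:8070/api/processCitationList",
+        data={"citations": citations},
+        timeout=(10.0, 180.0),  # (connect, read) seconds, so hung requests fail eventually
+    )
+    resp.raise_for_status()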
+
+Finally, got through the whole batch (`18.6M 8:35:02 [ 603 /s]` output), with
+only a few dozen rows like:
+
+ GROBID returned bad XML for Crossref DOI: 10.1007/978-3-030-03008-7_21-1
+ GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1496-8_3
+ GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1493-7_3
+ GROBID returned bad XML for Crossref DOI: 10.1007/978-3-319-96184-2_2
+ GROBID returned bad XML for Crossref DOI: 10.1063/1.5031970
+ truncating very large reference list for doi:10.1007/978-1-4757-1499-9_15 len:11401
+ GROBID returned bad XML for Crossref DOI: 10.1016/j.oraloncology.2019.104562
+ GROBID returned bad XML for Crossref DOI: 10.1016/j.pec.2020.04.010
+
+So things seem to be working!
+
+Summary lines looked like:
+
+ JSON lines pushed: Counter({'total': 531487, 'pushed': 531487})
+ Worker: Counter({'total': 536541, 'failed': 3})
+
+Failures per batch were on the order of 0 to 3.
+
+## Postgres Backfill
+
+Start with a sample:
+
+ zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \
+ | head -n1000 \
+ | ./persist_tool.py grobid-refs -
+ # Worker: Counter({'total': 1000, 'insert-grobid_refs': 1000, 'update-grobid_refs': 0})
+
+ # same command again:
+ # Worker: Counter({'total': 1000, 'update-grobid_refs': 1000, 'insert-grobid_refs': 0})
+
+Example DOIs:
+
+ # no refs
+ 10.1007/978-1-349-04135-0_3
+ http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-04135-0_3"
+
+ # with refs
+ 10.1007/978-1-349-03594-6_2
+ http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-03594-6_2"
+
+Seems to be working, so will do the full backfill. Can check table sizes on a
+per-table basis when complete.
+
+ zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \
+ | pv -l \
+ | ./persist_tool.py grobid-refs -
+ # Worker: Counter({'total': 18646668, 'insert-grobid_refs': 18639195, 'update-grobid_refs': 7473})
+
+
+## Kafka Setup
+
+Added ansible config and deployed persist-crossref worker.
+
+First roll-back just a couple days as a test:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-11-07T00:00:00.000
+
+ # eg: Import counts: Counter({'total': 372350, 'insert-grobid_refs': 326987, 'update-crossref': 265581, 'insert-crossref': 106769, 'update-grobid_refs': 45362, 'skip': 1})
+
+Then roll-back to before the snapshot and backfill, to catch up:
+
+ ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-10-26T00:00:00.000
+
+Ran this last command on 2021-11-10, and total lag was around 2,566,741.
diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md
new file mode 100644
index 0000000..5fb69d1
--- /dev/null
+++ b/notes/tasks/2021-12-06_regrobid.md
@@ -0,0 +1,380 @@
+
+Want to test recent updates of GROBID (to fix regex issue), and also re-process
+a number of PDFs which failed to process with GROBID initially.
+
+
+## HTTP 503
+
+These are attempts which failed because GROBID was too busy or not running.
+
+ # IMPROVED BELOW
+ COPY (
+ SELECT row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.status_code = 503
+ AND cdx.sha1hex IS NOT NULL
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+ WITH NULL '';
+ # COPY 4749
+
+Not actually that many, which seems good. Confirm that these are uniq by sha1hex:
+
+ cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l
+ # 302
+
+Nope! Need to add "distinct on":
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.status_code = 503
+ AND cdx.sha1hex IS NOT NULL
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+ WITH NULL '';
+ # COPY 4297
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+## Never Processed CDX
+
+PDFs in fatcat which have never been processed with GROBID.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM fatcat_file
+ LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+ LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json'
+ WITH NULL '';
+ # COPY 15488
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+
+PDFs in fatcat which have never been processed with pdfextract.
+
+ # TODO
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM fatcat_file
+ LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+ LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+ WHERE
+ pdf_meta.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND cdx.mimetype = 'application/pdf'
+ AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json'
+ WITH NULL '';
+ # COPY 45535
+
+ cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+ # 45.5k 0:00:01 [30.2k/s]
+
+## Timeout or Failure
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+ WHERE
+ (grobid.status_code = 500 OR grobid.status_code = -4)
+ AND cdx.sha1hex IS NOT NULL
+ AND file_meta.mimetype = 'application/pdf'
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json'
+ WITH NULL '';
+ # COPY 8,084,296
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+This seems to not be working very well, mostly errors, empty docs, etc. Will
+roll-forward the kafka consumer group after attempting a couple hundred
+thousand of these.
+
+Let's try limiting to files actually in fatcat:
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ (grobid.status_code = 500 OR grobid.status_code = -4)
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+ AND file_meta.mimetype = 'application/pdf'
+        -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json'
+ WITH NULL '';
+ # COPY 529265
+
+That is a much more manageable batch to retry.
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 529k 0:00:17 [31.0k/s]
+
+
+## Missing Fatcat Files
+
+There were around a half million fatcat file entities which didn't have `cdx`
+rows in sandcrawler. Did some specific pdfextract processing; now we should do
+GROBID ingest as well.
+
+Enqueue the `CDX` objects for GROBID and pdfextract processing:
+
+ zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 354k 0:00:11 [30.6k/s]
+
+ zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+And some earlier files of interest on `aitio`:
+
+ cat files_missing_sha256.ingest_results.json \
+ | rg '"application/pdf"' \
+ | rg -v "\\\\" \
+ | jq .cdx -c \
+ | sort -u -S 4G \
+ | pv -l \
+ > files_missing_sha256.cdx.uniq.json
+ # 100k 0:00:47 [2.09k/s]
+
+ cat files_missing_sha256.cdx.uniq.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+ cat files_missing_sha256.cdx.uniq.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+
+## Ancient Fatcat Files
+
+Files from an era where we didn't record GROBID version or status, even for
+success.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status_code = 200
+ AND grobid.status IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+        -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json'
+ WITH NULL '';
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 107k 0:00:03 [29.9k/s]
+
+
+## Start Re-Processing Old GROBID Versions
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status = 'success'
+ AND grobid.grobid_version NOT LIKE '0.7.%'
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+        -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json'
+ WITH NULL '';
+
+This one is huge, and want to process in batches/chunks of ~8 million at a time.
+
+ cd /srv/sandcrawler/tasks/
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \
+ | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json
+
+Submit individual batches like:
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Overall progress:
+
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small)
+
+This finally finished on 2022-04-26. Hooray!
+
+## General Counts
+
+How many fatcat files of what mimetype (reported in sandcrawler-db)?
+
+ SELECT file_meta.mimetype, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY file_meta.mimetype
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+----------
+ application/pdf | 45227033
+ | 433068
+ application/octet-stream | 30634
+ application/jats+xml | 6874
+ text/html | 876
+ application/postscript | 199
+ application/gzip | 173
+ text/plain | 84
+ application/xml | 48
+ application/vnd.ms-powerpoint | 38
+ application/msword | 16
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 8
+ image/jpeg | 6
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 4
+ message/rfc822 | 4
+ application/zip | 4
+ text/x-tex | 3
+ application/x-dosexec | 3
+ application/x-tar | 2
+ application/vnd.ms-tnef | 2
+ image/svg+xml | 1
+ image/tiff | 1
+ image/png | 1
+ image/gif | 1
+ application/vnd.ms-office | 1
+ (25 rows)
+
+
+PDF extract status?
+
+ SELECT pdf_meta.status, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY pdf_meta.status
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ status | count
+ ----------------+----------
+ success | 43415920
+ | 2018522
+ text-too-large | 122730
+ parse-error | 94876
+ not-pdf | 32156
+ error-wayback | 14504
+ bad-unicode | 279
+ bad-pdf | 98
+ empty-blob | 2
+ (9 rows)
+
+
+What are the GROBID status codes for fatcat files? Narrowed down:
+
+ SELECT grobid.status, grobid.status_code, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY grobid.status, grobid.status_code
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ status | status_code | count
+ ----------------+-------------+----------
+ success | 200 | 44409069
+ error | 500 | 580402
+ | | 468836
+ | 200 | 240660
+ error-timeout | -4 | 79
+ bad-grobid-xml | 200 | 38
+ error | 200 | 3
+ (7 rows)
+
+Ran the same query again on 2021-12-15:
+
+ status | status_code | count
+ ----------------+-------------+----------
+ success | 200 | 45092915
+ error | 500 | 302373
+ | | 250335
+ | 200 | 53352
+ bad-grobid-xml | 200 | 39
+ error-timeout | -4 | 37
+ error | 200 | 34
+ error | 503 | 2
+ (8 rows)
diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
new file mode 100644
index 0000000..b5422c2
--- /dev/null
+++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
@@ -0,0 +1,23 @@
+
+Martin crawled more than 10 million new PDFs from various platform domains. We
+should get these processed and included in sandcrawler-db.
+
+## Select CDX Rows
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM cdx
+ LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%'
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json'
+ WITH NULL '';
+ => COPY 8801527
+
+ cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+ # for pdfextract, would be: sandcrawler-prod.unextracted
diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md
new file mode 100644
index 0000000..c727a57
--- /dev/null
+++ b/notes/tasks/2022-03-07_ukraine_firedrill.md
@@ -0,0 +1,225 @@
+
+Want to do priority crawling of Ukrainian web content, plus Russia and Belarus.
+
+
+## What is Missing?
+
+ (country_code:ua OR lang:uk)
+ => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA
+ later in day, already some 22k missing found! wow
+ => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing
+
+## Metadata Prep
+
+- container metadata update (no code changes)
+ x wikidata SPARQL update
+ x chocula run
+ x journal metadata update (fatcat)
+ x update journal stats (fatcat extra)
+- DOAJ article metadata import
+ x prep and upload single JSON file
+
+
+## Journal Homepage URL Crawl
+
+x dump ukraine-related journal homepages from chocula DB
+x create crawl config
+x start crawl
+x repeat for belarus and russia
+
+
+ python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv
+
+sqlite3:
+
+    select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publisher like '%ukrain%';
+    1952
+
+ SELECT COUNT(*) FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+ => 1970
+
+ .mode csv
+ .once homepage_urls_ukraine.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+
+ .mode csv
+ .once homepage_urls_russia.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ru'
+ OR journal.lang = 'ru'
+ OR journal.name like '%russ%'
+ OR journal.publisher like '%russ%';
+
+ .mode csv
+ .once homepage_urls_belarus.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'by'
+ OR journal.lang = 'be'
+ OR journal.name like '%belarus%'
+ OR journal.publisher like '%belarus%';
+
+ cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+ 1971 homepage_urls_ukraine.tsv
+ 3482 homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv
+ 3728 homepage_urls_russia.tsv
+ 2420 homepage_urls.2022-03-08.ru_tld.tsv
+ 6030 homepage_urls_russia_combined.2022-03-08.tsv
+
+
+ cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv
+ 138 homepage_urls_belarus.tsv
+ 85 homepage_urls.2022-03-08.by_tld.tsv
+ 222 homepage_urls_belarus_combined.2022-03-08.tsv
+
+
+## Landing Page Crawl
+
+x create crawl config
+x fatcat ingest query for related URLs
+ => special request code/label?
+x finish .by and .ru article URL dump, start crawling
+x URL list filtered from new OAI-PMH feed
+ => do we need to do full bulk load/dump, or not?
+- URL list from partner (google)
+- do we need to do alternative thing of iterating over containers, ingesting each?
+
+ ./fatcat_ingest.py --env prod \
+ --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk"
+
+ # around Tue 08 Mar 2022 01:07:37 PM PST
+ # Expecting 185659 release objects in search queries
+ # didn't complete successfully? hrm
+
+ # ok, retry "manually" (with kafkacat)
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json
+ # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318})
+ # 103k 0:25:04 [68.7 /s]
+
+ zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz
+ # 103k 0:00:02 [38.1k/s]
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:by OR lang:be" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz
+ # Expecting 2266 release objects in search queries
+ # 1.29k 0:00:34 [37.5 /s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ru OR lang:ru" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz
+ # Expecting 1515246 release objects in search queries
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz
+
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt
+ # 309k 0:00:03 [81.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt
+ # 71.2k 0:00:03 [19.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt
+ # 276k 0:00:03 [72.9k/s]
+
+
+### Landing Page Bulk Ingest
+
+Running these 2022-03-24, after targeted crawl completed:
+
+ zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 103k 0:00:02 [36.1k/s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 1.29k 0:00:00 [15.8k/s]
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 546k 0:00:13 [40.6k/s]
+
+It will probably take a week or more for these to complete.
+
+
+## Outreach
+
+- openalex
+- sucho.org
+- ceeol.com
diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md
new file mode 100644
index 0000000..273ff32
--- /dev/null
+++ b/notes/tasks/2022-04-27_pdf_url_lists.md
@@ -0,0 +1,72 @@
+
+Another dump of PDF URLs for partners. This time we want to provide a TSV with
+full wayback download URLs, as well as "access" URLs.
+
+ export TASKDATE=2022-04-27
+
+## "Ingested", AKA, "Targetted" PDF URLs
+
+These are URLs where we did a successful ingest run.
+
+ COPY (
+ SELECT
+ terminal_sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
+ ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ AND hit = true
+ ORDER BY terminal_sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 85712674
+
+May contain duplicates, by sha1hex, by URL, or both.
+
+Note that this could be filtered by timestamp, to make it monthly/annual.
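+
+One way to do that post-hoc on the dumped TSV (a sketch; it could equally be a
+`WHERE` clause on `terminal_dt` in the COPY itself), using the 14-digit wayback
+timestamp embedded in the crawl URL:
+
+    import sys
+
+    for line in sys.stdin:
+        sha1hex, crawl_url, display_url = line.rstrip("\n").split("\t")
+        # crawl_url looks like: https://web.archive.org/web/<14-digit-timestamp>id_/<original-url>
+        timestamp = crawl_url.split("/web/", 1)[1][:14]
+        if timestamp.startswith("202204"):  # e.g., keep only April 2022 captures
+            sys.stdout.write(line)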
+
+
+## All CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require
+
+ COPY (
+ SELECT
+ cdx.sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url,
+ ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ ORDER BY cdx.sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 161504070
+
+Should be unique by wayback URL; may contain near-duplicates or duplicates by sha1hex.
+
+## Upload to archive.org
+
+TODO: next time compress these files first (gzip/pigz)
+
+    ia upload ia_scholarly_urls_$TASKDATE \
+        -m collection:ia_biblio_metadata \
+        -m title:"IA Scholarly URLs ($TASKDATE)" \
+        -m date:$TASKDATE \
+        -m creator:"Internet Archive Web Group" \
+        -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \
+        /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv
+
diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
new file mode 100644
index 0000000..74d3857
--- /dev/null
+++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
@@ -0,0 +1,132 @@
+
+Had a huge number of SPN requests for the andrzejklimczuk.com domain,
+presumably from the author.
+
+Many were duplicates (same file, multiple releases, often things like zenodo
+duplication). Many were also GROBID 500s, due to truncated common crawl
+captures.
+
+Needed to clean up! Basically sorted through a few editgroups manually, then
+rejected all the rest and manually re-submitted with the queries and commands
+below:
+
+ SELECT COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%';
+ => 589
+
+ SELECT ingest_file_result.status, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY ingest_file_result.status;
+
+ status | count
+ ----------------+-------
+ cdx-error | 1
+ success | 587
+ wrong-mimetype | 1
+ (3 rows)
+
+
+ SELECT grobid.status_code, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY grobid.status_code;
+
+ status_code | count
+ -------------+-------
+ 200 | 385
+ 500 | 202
+ | 2
+ (3 rows)
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 500
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json';
+ => COPY 202
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 200
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json';
+ => COPY 385
+
+    sudo -u sandcrawler pipenv run \
+        ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \
+        > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json
+
+    sudo -u sandcrawler pipenv run \
+        ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+        | jq '. + {force_recrawl: true}' -c \
+        > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json
+
+    cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \
+        | shuf \
+        | head -n60000 \
+        | jq . -c \
+        | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+    cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+        | shuf \
+        | head -n100 \
+        | jq . -c \
+        | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+    cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+        | shuf \
+        | head -n10000 \
+        | jq . -c \
+        | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+    sudo -u sandcrawler pipenv run \
+        ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+        > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json
+
+    cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \
+        | shuf \
+        | head -n60000 \
+        | jq . -c \
+        | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
diff --git a/pig/filter-cdx-pdfs.pig b/pig/filter-cdx-pdfs.pig
new file mode 100644
index 0000000..a2882ac
--- /dev/null
+++ b/pig/filter-cdx-pdfs.pig
@@ -0,0 +1,24 @@
+
+-- Tries to filter down a large CDX file (GWB index) to a subset of PDFs, by mimetype.
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
+%default INPUT ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as url, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, cdxline;
+cdx = FILTER cdx BY not url matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*pdf.*';
+cdx = ORDER cdx by url, timestamp PARALLEL 50;
+cdx = FOREACH cdx GENERATE cdxline;
+STORE cdx INTO '$OUTPUT' USING PigStorage(' ');
+
diff --git a/pig/filter-cdx-ps.pig b/pig/filter-cdx-ps.pig
index 6e80acc..b27a547 100644
--- a/pig/filter-cdx-ps.pig
+++ b/pig/filter-cdx-ps.pig
@@ -1,3 +1,9 @@
+-- Tries to filter down a large CDX file (GWB index) to a subset of postscript
+-- files, by mimetype.
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
%default INPUT ''
%default OUTPUT ''
diff --git a/pig/filter-cdx-source-code-crude.pig b/pig/filter-cdx-source-code-crude.pig
new file mode 100644
index 0000000..589aebd
--- /dev/null
+++ b/pig/filter-cdx-source-code-crude.pig
@@ -0,0 +1,40 @@
+
+-- Tries to filter down a large CDX file (GWB index) to a subset of source code
+-- files by mimetype and file extension.
+-- This is pretty crude and requires the URL to end with the file extension.
+---
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: October 2019
+
+
+%default INPUT ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
+cdx = FILTER cdx BY not surt matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*text.*';
+
+-- This is the core regex
+cdx = FILTER cdx
+
+ -- file suffix
+ BY surt matches '.*\\).*\\.(c|h|py|java)';
+
+-- DISTINCT by sha1 column
+cdx_uniq = FOREACH (GROUP cdx BY sha1sum) {
+ r = TOP(1, 0, $1);
+ GENERATE FLATTEN(r);
+};
+
+cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50;
+cdx_uniq = FOREACH cdx_uniq GENERATE cdxline;
+STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' ');
+
diff --git a/pig/filter-cdx-tarball.pig b/pig/filter-cdx-tarball.pig
new file mode 100644
index 0000000..d0be0f7
--- /dev/null
+++ b/pig/filter-cdx-tarball.pig
@@ -0,0 +1,38 @@
+
+-- Tries to filter down a large CDX file (GWB index) to a subset of tarballs
+-- (.tar.gz). Intention is to find software code that isn't in, eg, git.
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: May 2018
+
+
+%default INPUT ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+cdx = LOAD '$INPUT' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1sum, cdxline;
+cdx = FILTER cdx BY not surt matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY mimetype matches '.*(octet|gzip|gtar|tgz).*';
+
+-- This is the core regex
+cdx = FILTER cdx
+ -- .tar.gz in URL
+ BY surt matches '(?i).+\\).*\\.tar\\.gz.*';
+
+-- DISTINCT by sha1 column
+cdx_uniq = FOREACH (GROUP cdx BY sha1sum) {
+ r = TOP(1, 0, $1);
+ GENERATE FLATTEN(r);
+};
+
+cdx_uniq = ORDER cdx_uniq by surt, timestamp PARALLEL 50;
+cdx_uniq = FOREACH cdx_uniq GENERATE cdxline;
+STORE cdx_uniq INTO '$OUTPUT' USING PigStorage(' ');
+
diff --git a/pig/join-cdx-sha1.pig b/pig/join-cdx-sha1.pig
new file mode 100644
index 0000000..86b9bb6
--- /dev/null
+++ b/pig/join-cdx-sha1.pig
@@ -0,0 +1,43 @@
+
+--
+-- Author: Bryan Newbold <bnewbold@archive.org>
+-- Date: December 2020
+--
+-- This pig script is intended to run against the full (many TByte) GWB CDX, and
+-- catch captures that match exact SHA1 (b32 encoded), regardless of mimetype.
+--
+-- The process is to filter the CDX for non-revisit HTTP 200s, sort this by
+-- SHA1 digest, then join with the (pre-sorted) SHA1-b32 input list, and dump
+-- output.
+
+%default INPUT_CDX ''
+%default INPUT_DIGEST ''
+%default OUTPUT ''
+
+set mapreduce.job.queuename default
+
+digests = LOAD '$INPUT_DIGEST' AS sha1b32:chararray;
+digests = ORDER digests by sha1b32 ASC PARALLEL 20;
+digests = DISTINCT digests;
+
+cdx = LOAD '$INPUT_CDX' AS cdxline:chararray;
+cdx = FILTER cdx BY not STARTSWITH (cdxline, 'filedesc');
+cdx = FILTER cdx BY not STARTSWITH (cdxline, ' ');
+
+cdx = FOREACH cdx GENERATE STRSPLIT(cdxline,'\\s+') as cols, cdxline;
+cdx = FOREACH cdx GENERATE (chararray)cols.$0 as cdx_surt, (chararray)cols.$1 as timestamp, (chararray)cols.$3 as mimetype, (chararray)cols.$4 as httpstatus, (chararray)cols.$5 as sha1b32, cdxline;
+cdx = FILTER cdx BY not cdx_surt matches '-';
+cdx = FILTER cdx BY httpstatus matches '200';
+cdx = FILTER cdx BY not mimetype matches 'warc/revisit';
+cdx = ORDER cdx BY sha1b32 ASC PARALLEL 40;
+
+-- TODO: DISTINCT by (sha1b32, cdx_surt) for efficiency
+
+-- Core JOIN
+full_join = JOIN cdx BY sha1b32, digests BY sha1b32;
+
+-- TODO: at most, say 5 CDX lines per sha1b32?
+
+result = FOREACH full_join GENERATE cdxline;
+
+STORE result INTO '$OUTPUT' USING PigStorage();
diff --git a/pig/tests/files/example.sha1b32 b/pig/tests/files/example.sha1b32
new file mode 100644
index 0000000..20a1357
--- /dev/null
+++ b/pig/tests/files/example.sha1b32
@@ -0,0 +1,4 @@
+EJWYVOPONJRARK7SGG6COFRN7CSTHROY
+V32E3CCO7NMI2M4OHLKG73DXD72LR4B2
+3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
+E3WSNQ7JAFOW7N3ZJ6GLV27T52T25JDK
diff --git a/pig/tests/files/sourcecode.cdx b/pig/tests/files/sourcecode.cdx
new file mode 100644
index 0000000..eeb397c
--- /dev/null
+++ b/pig/tests/files/sourcecode.cdx
@@ -0,0 +1,6 @@
+# match
+edu,cmu,cs,adm,reports-archive)/anon/usr0/ftp/usr0/anon/2002/cmu-cs-02-119.java 20170706005950 http://reports-archive.adm.cs.cmu.edu/anon/usr0/ftp/usr0/anon/2002/CMU-CS-02-119.java text/plain 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 361006 17120058 CITESEERX-CRAWL-2017-06-20-20170706004100259-00924-00932-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170706005946792-00926-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+# no
+fi,tkk,lib)/diss/2001/isbn951225459x/isbn951225459x.pyc 20170705074926 http://lib.tkk.fi/Diss/2001/isbn951225459X/isbn951225459X.pyc text/plain 200 KJBCOT7LGBNIAVGEGPUELK5OK6RTFORR - - 344175 255650124 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+# no
+org,oxfordjournals,nar)/cgi/reprint/gkl1060v1.pdf 20170706035441 http://nar.oxfordjournals.org/cgi/reprint/gkl1060v1.pdf text/html 301 OX6MLVDFURLT2KSYCXUYW2PZNOVFSEVF - - 697 49346051 CITESEERX-CRAWL-2017-06-20-20170706034741172-00140-00149-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706035435634-00148-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
diff --git a/pig/tests/files/tarballs.cdx b/pig/tests/files/tarballs.cdx
new file mode 100644
index 0000000..7a81b79
--- /dev/null
+++ b/pig/tests/files/tarballs.cdx
@@ -0,0 +1,10 @@
+#http://research.fit.edu/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf
+#http://ijs.sgmjournals.org:80/cgi/reprint/54/6/2217.pdf
+#http://eprints.ecs.soton.ac.uk/12020/1/mind-the-semantic-gap.pdf
+#http://eprint.uq.edu.au/archive/00004120/01/R103_Forrester_pp.pdf
+
+# should match 2:
+
+edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.pdf 20170706005950 http://mit.edu/file.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz
+edu,fit,research)/sealevelriselibrary/documents/doc_mgr/448/Florida_Keys_Low_Island_Biodiversity_&_SLR_-_Ross_et_al_2009.tar.gz 20170706005950 http://mit.edu/file.tar.gz application/octet-stream 200 NQHD36X5MNZPWFNMD5LFOYZSFGCHUN3I - - 123 456 CRAWL/CRAWL.warc.gz
+org,sgmjournals,ijs)//cgi/reprint/54/6/2217.tar.gz 20170706005950 http://mit.edu/file.tar.gz application/gzip 200 TQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 123 456 CRAWL/CRAWL.warc.gz
diff --git a/pig/tests/pighelper.py b/pig/tests/pighelper.py
index 4aa4259..95e0426 100644
--- a/pig/tests/pighelper.py
+++ b/pig/tests/pighelper.py
@@ -17,6 +17,9 @@ import unittest
import subprocess
+def count_lines(s):
+ return len([l for l in s.strip().split('\n') if len(l) > 0])
+
class PigTestHelper(unittest.TestCase):
@classmethod
@@ -50,7 +53,7 @@ class PigTestHelper(unittest.TestCase):
return retval
def run_pig(self, script_path, in_file, **kwargs):
- """Convenience helper around run_pig().
+ """Convenience helper around run_pig_raw().
INPUT parameter is set to in_file.
OUTPUT parameter is set to a random file.
diff --git a/pig/tests/test_filter_cdx_paper_pdfs.py b/pig/tests/test_filter_cdx_paper_pdfs.py
index a8ebd9f..c2d2e6b 100644
--- a/pig/tests/test_filter_cdx_paper_pdfs.py
+++ b/pig/tests/test_filter_cdx_paper_pdfs.py
@@ -1,10 +1,8 @@
import os
import unittest
-from pighelper import PigTestHelper
+from pighelper import PigTestHelper, count_lines
-def count_lines(s):
- return len([l for l in s.strip().split('\n') if len(l) > 0])
class TestFilterCDXPaperPdfs(PigTestHelper):
diff --git a/pig/tests/test_filter_software.py b/pig/tests/test_filter_software.py
new file mode 100644
index 0000000..cce90b4
--- /dev/null
+++ b/pig/tests/test_filter_software.py
@@ -0,0 +1,16 @@
+
+import os
+import unittest
+from pighelper import PigTestHelper, count_lines
+
+
+class TestFilterCDXSoftware(PigTestHelper):
+
+ def test_tarballs(self):
+ r = self.run_pig("filter-cdx-tarball.pig", "tests/files/tarballs.cdx")
+ assert count_lines(r) == 2
+
+ def test_source_code(self):
+ r = self.run_pig("filter-cdx-source-code-crude.pig", "tests/files/sourcecode.cdx")
+ assert count_lines(r) == 1
+
diff --git a/pig/tests/test_join_cdx.py b/pig/tests/test_join_cdx.py
new file mode 100644
index 0000000..e6eca6a
--- /dev/null
+++ b/pig/tests/test_join_cdx.py
@@ -0,0 +1,44 @@
+
+import os
+import unittest
+import tempfile
+import subprocess
+from pighelper import PigTestHelper, count_lines
+
+class TestJoinCDXSha1(PigTestHelper):
+
+ def run_pig_join(self, script_path, cdx_file, digest_file, **kwargs):
+        """Convenience helper around run_pig_raw() for the join script.
+
+        INPUT_CDX parameter is set to cdx_file.
+        INPUT_DIGEST parameter is set to digest_file.
+        OUTPUT parameter is set to a random file.
+        Any keyword args are passed as parameters.
+        """
+
+ pargs = []
+ for key, value in kwargs.items():
+ pargs.append('-p')
+ pargs.append('{}={}'.format(key, value))
+
+ out_file = tempfile.mktemp(dir=self._tmpdir)
+ params = [
+ '-f', script_path,
+ '-p', 'INPUT_CDX={}'.format(cdx_file),
+ '-p', 'INPUT_DIGEST={}'.format(digest_file),
+ '-p', 'OUTPUT={}'.format(out_file),
+ ] + pargs
+ status = self.run_pig_raw(params)
+ assert status.returncode == 0
+ # Capture all the part-r-* files
+ print("out_file: {}".format(out_file))
+ subprocess.run("/bin/ls -la {}/part-*".format(out_file), shell=True)
+ sub = subprocess.run("/bin/cat {}/part-*".format(out_file), stdout=subprocess.PIPE, shell=True)
+ out = sub.stdout.decode('utf-8')
+ print(out)
+ return out
+
+ # TODO: helper to verify that output matches an expected file
+
+ def test_thing(self):
+ r = self.run_pig_join("join-cdx-sha1.pig", "tests/files/example.cdx", "tests/files/example.sha1b32")
+ assert count_lines(r) == 4
diff --git a/please b/please
index 10fa843..74e9766 100755
--- a/please
+++ b/please
@@ -12,7 +12,7 @@ import subprocess
from datetime import datetime
HDFS_DIR = "hdfs:///user/bnewbold/sandcrawler"
-HBASE_HOST = "wbgrp-svc263.us.archive.org"
+HBASE_HOST = "wbgrp-svc350.us.archive.org"
ZOOKEEPER_HOSTS = "mtrcs-zk1.us.archive.org:2181"
GROBID_URI = "http://wbgrp-svc096.us.archive.org:8070"
@@ -487,6 +487,23 @@ def run_dumpungrobided(args):
env=args.env)
subprocess.call(cmd, shell=True)
+def run_sbackfill(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting scalding backfill job...")
+ cmd = """hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool sandcrawler.CdxBackfillJob \
+ --hdfs \
+ --app.conf.path scalding/ia_cluster.conf \
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --cdx-input-path {input_cdx}""".format(
+ input_cdx=args.input_cdx,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
+ subprocess.call(cmd, shell=True)
+
def main():
parser = argparse.ArgumentParser()
@@ -506,6 +523,11 @@ def main():
sub_backfill.add_argument('input_cdx',
help="full HDFS path of CDX file to backfill")
+ sub_sbackfill = subparsers.add_parser('sbackfill')
+ sub_sbackfill.set_defaults(func=run_sbackfill)
+ sub_sbackfill.add_argument('input_cdx',
+ help="full HDFS path of CDX file to backfill")
+
sub_extract = subparsers.add_parser('extract')
sub_extract.set_defaults(func=run_extract)
sub_extract.add_argument('input_cdx',
@@ -600,7 +622,7 @@ def main():
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do! (try --help)")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
if not (args.prod or args.qa) or (args.prod and args.qa):
print("must pass one of --prod or --qa")
diff --git a/sandcrawler-rfc.md b/proposals/2018_original_sandcrawler_rfc.md
index fea6a7c..ecf7ab8 100644
--- a/sandcrawler-rfc.md
+++ b/proposals/2018_original_sandcrawler_rfc.md
@@ -73,7 +73,7 @@ process HTML and look for PDF outlinks, but wouldn't crawl recursively.
HBase is used for de-dupe, with records (pointers) stored in WARCs.
A second config would take seeds as entire journal websites, and would crawl
-continously.
+continuously.
Other components of the system "push" tasks to the crawlers by copying schedule
files into the crawl action directories.
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md
new file mode 100644
index 0000000..768784f
--- /dev/null
+++ b/proposals/2019_ingest.md
@@ -0,0 +1,287 @@
+
+status: deployed
+
+This document proposes structure and systems for ingesting (crawling) paper
+PDFs and other content as part of sandcrawler.
+
+## Overview
+
+The main abstraction is a sandcrawler "ingest request" object, which can be
+created and submitted to one of several systems for automatic harvesting,
+resulting in an "ingest result" metadata object. This result should contain
+enough metadata to be automatically imported into fatcat as a file/release
+mapping.
+
+The structure and pipelines should be flexible enough to work with individual
+PDF files, web captures, and datasets. It should work for on-demand
+(interactive) ingest (for "save paper now" features), soft-real-time
+(hourly/daily/queued), batches of hundreds or thousands of requests, and scale
+up to batch ingest crawls of tens of millions of URLs. Most code should not
+care about how or when content is actually crawled.
+
+The motivation for this structure is to consolidate and automate the current ad
+hoc systems for crawling, matching, and importing into fatcat. It is likely
+that there will still be a few special cases with their own importers, but the
+goal is that in almost all cases that we discover a new structured source of
+content to ingest (eg, a new manifest of identifiers to URLs), we can quickly
+transform the task into a list of ingest requests, then submit those requests
+to an automated system to have them archived and inserted into fatcat with as
+little manual effort as possible.
+
+## Use Cases and Workflows
+
+### Unpaywall Example
+
+As a motivating example, consider how unpaywall crawls are done today:
+
+- download and archive JSON dump from unpaywall. transform and filter into a
+ TSV with DOI, URL, release-stage columns.
+- filter out previously crawled URLs from this seed file, based on last dump,
+ with the intent of not repeating crawls unnecessarily
+- run heritrix3 crawl, usually by sharding seedlist over multiple machines.
+ after crawl completes:
+ - backfill CDX PDF subset into hbase (for future de-dupe)
+ - generate CRL files etc and upload to archive items
+- run arabesque over complete crawl logs. this takes time, is somewhat manual,
+ and has scaling issues past a few million seeds
+- depending on source/context, run fatcat import with arabesque results
+- periodically run GROBID (and other transforms) over all new harvested files
+
+Issues with this are:
+
+- seedlist generation and arabesque step are toilsome (manual), and arabesque
+ likely has metadata issues or otherwise "leaks" content
+- brozzler pipeline is entirely separate
+- results in re-crawls of content already in wayback, in particular links
+ between large corpuses
+
+New plan:
+
+- download dump, filter, transform into ingest requests (mostly the same as
+  before; see the sketch after this list)
+- load into ingest-request SQL table. only new rows (unique by source, type,
+ and URL) are loaded. run a SQL query for new rows from the source with URLs
+ that have not been ingested
+- (optional) pre-crawl bulk/direct URLs using heritrix3, as before, to reduce
+ later load on SPN
+- run ingest script over the above SQL output. ingest first hits CDX/wayback,
+ and falls back to SPNv2 (brozzler) for "hard" requests, or based on URL.
+ ingest worker handles file metadata, GROBID, any other processing. results go
+ to kafka, then SQL table
+- either do a bulk fatcat import (via join query), or just have workers
+ continuously import into fatcat from kafka ingest feed (with various quality
+ checks)
+
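+As an illustration of the "transform into ingest requests" step above, a
+minimal sketch (field names follow the public unpaywall dump format; the exact
+transform and release-stage mapping are simplified assumptions, not the real
+script):
+
+    import json, sys
+
+    def unpaywall_to_request(row: dict) -> dict:
+        loc = row.get('best_oa_location') or {}
+        url = loc.get('url_for_pdf') or loc.get('url')
+        if not url or not row.get('doi'):
+            return None
+        return {
+            'ingest_type': 'pdf',
+            'base_url': url,
+            'link_source': 'unpaywall',
+            'link_source_id': row['doi'].lower(),
+            'ingest_request_source': 'unpaywall-dump',
+            # unpaywall 'version' values would still need mapping to fatcat
+            # release_stage terms
+            'release_stage': loc.get('version'),
+        }
+
+    for line in sys.stdin:
+        req = unpaywall_to_request(json.loads(line))
+        if req:
+            print(json.dumps(req))
+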
+## Request/Response Schema
+
+For now, plan is to have a single request type, and multiple similar but
+separate result types, depending on the ingest type (file, fileset,
+webcapture). The initial use case is single file PDF ingest.
+
+NOTE: what about crawl requests where we don't know if we will get a PDF or
+HTML? Or both? Let's just recrawl.
+
+*IngestRequest*
+ - `ingest_type`: required, one of `pdf`, `xml`, `html`, `dataset`. For
+ backwards compatibility, `file` should be interpreted as `pdf`. `pdf` and
+ `xml` return file ingest response; `html` and `dataset` not implemented but
+ would be webcapture (wayback) and fileset (archive.org item or wayback?).
+ In the future: `epub`, `video`, `git`, etc.
+ - `base_url`: required, where to start crawl process
+ - `link_source`: recommended, slug string. indicating the database or "authority"
+ where URL/identifier match is coming from (eg, `doi`, `pmc`, `unpaywall`
+ (doi), `s2` (semantic-scholar id), `spn` (fatcat release), `core` (CORE
+ id), `mag` (MAG id))
+ - `link_source_id`: recommended, identifier string. pairs with `link_source`.
+ - `ingest_request_source`: recommended, slug string. tracks the service or
+ user who submitted request. eg, `fatcat-changelog`, `editor_<ident>`,
+ `savepapernow-web`
+ - `release_stage`: optional. indicates the release stage of fulltext expected to be found at this URL
+ - `rel`: optional. indicates the link type
+ - `force_recrawl`: optional. if true, will always SPNv2 (won't check wayback)
+ - `oa_status`: optional. unpaywall schema
+ - `edit_extra`: additional metadata to be included in any eventual fatcat commits.
+ - `fatcat`
+ - `release_ident`: optional. if provided, indicates that ingest is expected
+ to be fulltext copy of this release (though may be a sibling release
+ under same work if `release_stage` doesn't match)
+ - `work_ident`: optional, unused. might eventually be used if, eg,
+ `release_stage` of ingested file doesn't match that of the `release_ident`
+ - `ext_ids`: matching fatcat schema. used for later lookups. sometimes
+ `link_source` and id are sufficient.
+ - `doi`
+ - `pmcid`
+ - ...
+
+*FileIngestResult*
+ - `request` (object): the full IngestRequest, copied
+ - `status` (slug): 'success', 'error', etc
+ - `hit` (boolean): whether we got something that looks like what was requested
+ - `terminal` (object): last crawled resource (if any)
+ - `terminal_url` (string; formerly `url`)
+ - `terminal_dt` (string): wayback capture datetime (string)
+ - `terminal_status_code`
+ - `terminal_sha1hex`: should match true `file_meta` SHA1 (not necessarily CDX SHA1)
+ (in case of transport encoding difference)
+ - `file_meta` (object): info about the terminal file
+ - same schema as sandcrawler-db table
+ - `size_bytes`
+ - `md5hex`
+ - `sha1hex`
+ - `sha256hex`
+  - `mimetype`: if not known, `application/octet-stream`
+ - `cdx`: CDX record matching terminal resource. *MAY* be a revisit or partial
+ record (eg, if via SPNv2)
+ - same schema as sandcrawler-db table
+ - `revisit_cdx` (optional): if `cdx` is a revisit record, this will be the
+    best "original" location for retrieval of the body (matching `file_meta`)
+ - same schema as sandcrawler-db table
+ - `grobid`
+ - same schema as sandcrawler-db table
+ - `status` (string)
+ - `status_code` (int)
+ - `grobid_version` (string, from metadata)
+ - `fatcat_release` (string, from metadata)
+ - `metadata` (JSON) (with `grobid_version` and `fatcat_release` removed)
+ - NOT `tei_xml` (strip from reply)
+ - NOT `file_meta` (strip from reply)
+
+In general, it is the `terminal_dt` and `terminal_url` that should be used to
+construct wayback links (eg, for insertion to fatcat), not from the `cdx`.
+
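+As an illustration of the request schema above, a `pdf` ingest request for an
+unpaywall-sourced URL might look like the following (all values, including the
+release ident, are made-up placeholders):
+
+    {
+        "ingest_type": "pdf",
+        "base_url": "https://journal.example.org/article/123/download",
+        "link_source": "unpaywall",
+        "link_source_id": "10.1234/example.123",
+        "ingest_request_source": "unpaywall-dump",
+        "release_stage": "published",
+        "fatcat": {"release_ident": "aaaaaaaaaaaaaaaaaaaaaaaaaa"},
+        "ext_ids": {"doi": "10.1234/example.123"}
+    }
+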
+## New SQL Tables
+
+Sandcrawler should persist status about:
+
+- claimed locations (links) to fulltext copies of in-scope works, from indexes
+ like unpaywall, MAG, semantic scholar, CORE
+ - with enough context to help insert into fatcat if works are crawled and
+ found. eg, external identifier that is indexed in fatcat, and
+ release-stage
+- state of attempting to crawl all such links
+ - again, enough to insert into fatcat
+ - also info about when/how crawl happened, particularly for failures, so we
+ can do retries
+
+Proposing two tables:
+
+ -- source/source_id examples:
+ -- unpaywall / doi
+ -- mag / mag_id
+ -- core / core_id
+ -- s2 / semanticscholar_id
+ -- doi / doi (for any base_url which is just https://doi.org/10..., regardless of why enqueued)
+ -- pmc / pmcid (for any base_url like europmc.org, regardless of why enqueued)
+ -- arxiv / arxiv_id (for any base_url like arxiv.org, regardless of why enqueued)
+ CREATE TABLE IF NOT EXISTS ingest_request (
+ -- conceptually: source, source_id, ingest_type, url
+ -- but we use this order for PRIMARY KEY so we have a free index on type/URL
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+        base_url                TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+ link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
+ link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
+
+ created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ release_stage TEXT CHECK (octet_length(release_stage) >= 1),
+ request JSONB,
+ -- request isn't required, but can stash extra fields there for import, eg:
+ -- ext_ids (source/source_id sometimes enough)
+ -- release_ident (if ext_ids and source/source_id not specific enough; eg SPN)
+ -- edit_extra
+ -- rel
+ -- oa_status
+ -- ingest_request_source TEXT NOT NULL CHECK (octet_length(ingest_request_source) >= 1),
+
+ PRIMARY KEY (ingest_type, base_url, link_source, link_source_id)
+ );
+
+ CREATE TABLE IF NOT EXISTS ingest_file_result (
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+        base_url                TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ hit BOOLEAN NOT NULL,
+        status              TEXT,
+        terminal_url        TEXT,    -- to be indexed
+        terminal_dt         TEXT,
+        terminal_status_code INT,
+        terminal_sha1hex    TEXT,    -- to be indexed
+
+ PRIMARY KEY (ingest_type, base_url)
+ );
+
+## New Kafka Topics
+
+- `sandcrawler-ENV.ingest-file-requests`
+- `sandcrawler-ENV.ingest-file-results`
+
+## Ingest Tool Design
+
+The basics of the ingest tool are to:
+
+- use native wayback python library to do fast/efficient lookups and redirect
+ lookups
+- starting from base-url, do a fetch to either target resource or landing page:
+ follow redirects, at terminus should have both CDX metadata and response body
+ - if no capture, or most recent is too old (based on request param), do
+ SPNv2 (brozzler) fetches before wayback lookups
+- if looking for PDF but got landing page (HTML), try to extract a PDF link
+ from HTML using various tricks, then do another fetch. limit this
+ recursion/spidering to just landing page (or at most one or two additional
+ hops)
+
+Note that if we pre-crawled with heritrix3 (with `citation_pdf_url` link
+following), then in the large majority of simple cases the content will already
+be in wayback, and ingest only needs fast CDX lookups (no SPNv2).
+
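+A minimal sketch of the lookup/fallback flow described above (helper names like
+`cdx_lookup`, `spn2_capture`, and `extract_pdf_link` are hypothetical
+placeholders, and the status strings are illustrative, not the final worker
+API):
+
+    def ingest_file(request: dict) -> dict:
+        url = request['base_url']
+        for _ in range(3):  # base URL plus at most a couple landing-page hops
+            resource = None
+            if not request.get('force_recrawl'):
+                resource = cdx_lookup(url, follow_redirects=True)
+            if resource is None or capture_too_old(resource):
+                resource = spn2_capture(url)   # fall back to SPNv2 (brozzler)
+            if resource is None:
+                return {'request': request, 'status': 'no-capture', 'hit': False}
+            if resource.mimetype == 'application/pdf':
+                return {'request': request, 'status': 'success', 'hit': True,
+                        'terminal': resource.terminal_dict()}
+            # got an HTML landing page: try to extract a PDF link and loop
+            url = extract_pdf_link(resource.body)
+            if not url:
+                return {'request': request, 'status': 'no-pdf-link', 'hit': False}
+        return {'request': request, 'status': 'max-hops-exceeded', 'hit': False}
+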
+## Design Issues
+
+### Open Questions
+
+Do direct aggregator/repositories crawls need to go through this process? Eg
+arxiv.org or pubmed central. I guess so, otherwise how do we get full file
+metadata (size, other hashes)?
+
+When recording hit status for a URL (ingest result), is that status dependent
+on the crawl context? Eg, for save-paper-now we might want to require GROBID.
+Semantics of `hit` should probably be consistent: whether we got the filetype
+expected for the ingest type, not whether we would actually import to fatcat.
+
+Where to include knowledge about, eg, single-page abstract PDFs being bogus? Do
+we just block crawling, set an ingest result status, or only filter at fatcat
+import time? Definitely need to filter at fatcat import time to make sure
+things don't slip through elsewhere.
+
+### Yet Another PDF Harvester
+
+This system could result in "yet another" set of publisher-specific heuristics
+and hacks to crawl publicly available papers. Related existing work includes
+[unpaywall's crawler][unpaywall_crawl], LOCKSS extraction code, dissem.in's
+efforts, zotero's bibliography extractor, etc. The "memento tracer" work is
+also similar. Many of these are even in python! It would be great to reduce
+duplicated work and maintenance. An analogous system in the wild is youtube-dl
+for downloading video from many sources.
+
+[unpaywall_crawl]: https://github.com/ourresearch/oadoi/blob/master/webpage.py
+[memento_tracer]: http://tracer.mementoweb.org/
+
+One argument against this would be that our use-case is closely tied to
+save-page-now, wayback, and the CDX API. However, a properly modular
+implementation of a paper downloader would allow components to be re-used, and
+perhaps dependency injection for things like HTTP fetches to allow use of SPN
+or similar. Another argument for modularity would be support for headless
+crawling (eg, brozzler).
+
+Note that this is an internal implementation detail; the ingest API would
+abstract all this.
+
+## Test Examples
+
+Some example works that are difficult to crawl. Should have mechanisms to crawl
+and unit tests for all these.
+
+- <https://pubs.acs.org>
+- <https://linkinghub.elsevier.com> / <https://sciencedirect.com>
+- <https://www.osapublishing.org/captcha/?guid=39B0E947-C0FC-B5D8-2C0C-CCF004FF16B8>
+- <https://utpjournals.press/action/cookieAbsent>
+- <https://academic.oup.com/jes/article/3/Supplement_1/SUN-203/5484104>
+- <http://www.jcancer.org/v10p4038.htm>
diff --git a/proposals/2019_pdftotext_pdfinfo.md b/proposals/2019_pdftotext_pdfinfo.md
new file mode 100644
index 0000000..ed731a4
--- /dev/null
+++ b/proposals/2019_pdftotext_pdfinfo.md
@@ -0,0 +1,123 @@
+
+status: brainstorming/backburner
+
+last updated: 2019-12-11
+
+This document proposes changes to extract text and metadata from PDFs at ingest
+time using pdftotext and pdfinfo, and storing this content in SQL and minio.
+
+This isn't a priority at the moment. Could be useful for fulltext search when
+GROBID fails, and the pdfinfo output might help with other quality checks.
+
+## Overview / Motivation
+
+`pdfinfo` and `pdftotext` can both be run quickly over raw PDFs. In
+sandcrawler, fetching PDFs can be a bit slow, so the motivation for caching the
+text is just to not have to fetch the PDFs over and over. Metadata is useful to
+store and index at scale.
+
+## pdfinfo output
+
+Example PDF info outputs:
+
+ Creator: PDF Suite 2010
+ Producer: PDF Suite 2010
+ CreationDate: Tue Sep 24 23:03:58 2013 PDT
+ ModDate: Tue Sep 24 23:03:58 2013 PDT
+ Tagged: no
+ UserProperties: no
+ Suspects: no
+ Form: none
+ JavaScript: no
+ Pages: 17
+ Encrypted: no
+ Page size: 612 x 792 pts (letter)
+ Page rot: 0
+ File size: 105400 bytes
+ Optimized: no
+ PDF version: 1.4
+
+another:
+
+ Title: Miscellanea Zoologica Hungarica 8. 1993 (Budapest, 1993)
+ Author: L. Forró szerk.
+ Producer: ABBYY FineReader 9.0 Corporate Edition
+ CreationDate: Wed Apr 13 05:30:21 2011 PDT
+ ModDate: Wed Apr 13 09:53:27 2011 PDT
+ Tagged: yes
+ UserProperties: no
+ Suspects: no
+ Form: AcroForm
+ JavaScript: no
+ Pages: 13
+ Encrypted: no
+ Page size: 473.76 x 678.42 pts
+ Page rot: 0
+ File size: 12047270 bytes
+ Optimized: no
+ PDF version: 1.6
+
+With the `-meta` flag, you get XML output, which also includes:
+
+ <xmpMM:DocumentID>uuid:cd1a8daa-61e1-48f4-b679-26eac52bb6a9</xmpMM:DocumentID>
+ <xmpMM:InstanceID>uuid:dea54c78-8bc6-4f2f-a665-4cd7e62457e7</xmpMM:InstanceID>
+
+The document id is particularly interesting for fatcat/sandcrawler. Apparently
+it is randomly created (or perhaps derived from an MD5?) from the first version
+of the file, and persists across edits. A quality check would be that all files
+with the same `document_id` should be clustered under the same fatcat work.
+
+All the info fields could probably be combined and used in categorization and
+filtering (ML or heuristic). Eg, a PDF with forms is probably not research
+output; published PDFs with specific "Producer" software probably are.
+
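+A rough sketch of collecting these fields with the `pdfinfo` command-line tool
+(parsing the plain key/value output shown above; the helper name is
+hypothetical):
+
+    import subprocess
+
+    def parse_pdfinfo(pdf_path: str) -> dict:
+        """Run pdfinfo and parse its 'Key: value' output into a dict."""
+        out = subprocess.run(['pdfinfo', pdf_path], capture_output=True,
+                             text=True, check=True).stdout
+        info = {}
+        for line in out.splitlines():
+            if ':' in line:
+                key, _, value = line.partition(':')
+                info[key.strip()] = value.strip()
+        return info
+
+    # e.g. info.get('Pages'), info.get('Producer'), info.get('PDF version')
+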
+## Fatcat Changes
+
+Could include in entity fields, a `pdfinfo` JSONB field, or existing `extra`:
+
+- pages
+- words
+- document id
+- page size
+- created
+- other meta (eg, PDF title, author, etc)
+
+All of these fields are, I assume, deterministic, thus appropriate for
+inclusion in fatcat.
+
+## New SQL Tables
+
+ CREATE TABLE IF NOT EXISTS pdftotext (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ tool_version TEXT CHECK (octet_length(tool_version) >= 1),
+ text_success BOOLEAN NOT NULL,
+ text_words INT,
+ info_success BOOLEAN NOT NULL,
+ pages INT,
+ pdf_created TIMESTAMP WITH TIME ZONE,
+ document_id TEXT CHECK (octet_length(document_id) >= 1), -- XXX: always UUID?
+ metadata JSONB
+ -- metadata contains any other stuff from pdfinfo:
+ -- title
+ -- author
+ -- pdf version
+ -- page size (?)
+ -- instance_id
+ );
+ -- CREATE INDEX pdftotext ON pdftotext(document_id);
+
+## New Kafka Topics
+
+ sandcrawler-ENV.pdftotext-output
+
+Key would be sha1hex of PDF.
+
+Schema would match the SQL table, plus the full raw PDF text output.
+
+## New Minio Stuff
+
+ /pdftotext/<hexbyte0>/<hexbyte1>/<sha1hex>.txt
+
+## Open Questions
+
diff --git a/proposals/20200129_pdf_ingest.md b/proposals/20200129_pdf_ingest.md
new file mode 100644
index 0000000..157607e
--- /dev/null
+++ b/proposals/20200129_pdf_ingest.md
@@ -0,0 +1,272 @@
+
+status: deployed
+
+2020q1 Fulltext PDF Ingest Plan
+===================================
+
+This document lays out a plan and tasks for a push on crawling and ingesting
+more fulltext PDF content in early 2020.
+
+The goal is to get the current generation of pipelines and matching tools
+running smoothly by the end of March, when the Mellon phase 1 grant ends. As a
+"soft" goal, would love to see over 25 million papers (works) with fulltext in
+fatcat by that deadline as well.
+
+This document is organized by conceptual approach, then by jobs to run and
+coding tasks needing work.
+
+There is a lot of work here!
+
+
+## Broad OA Ingest By External Identifier
+
+There are a few million papers in fatcat which:
+
+1. have a DOI, arxiv id, or pubmed central id, which can be followed to a
+ landing page or directly to a PDF
+2. are known OA, usually because publication is Gold OA
+3. don't have any fulltext PDF in fatcat
+
+As a detail, some of these "known OA" journals actually have embargoes (aka,
+they aren't true Gold OA). In particular, those marked via EZB OA "color", and
+recent pubmed central ids.
+
+Of these, I think there are broadly two categories. The first is just papers we
+haven't tried directly crawling or ingesting yet at all; these should be easy
+to crawl and ingest. The second category is papers from large publishers with
+difficult to crawl landing pages (for example, Elsevier, IEEE, Wiley, ACM). The
+later category will probably not crawl with heritrix, and we are likely to be
+rate-limited or resource constrained when using brozzler or
+
+Coding Tasks:
+
+- improve `fatcat_ingest.py` script to allow more granular slicing and limiting
+ the number of requests enqueued per batch (eg, to allow daily partial
+ big-publisher ingests in random order). Allow dumping arxiv+pmcid ingest
+ requests.
+
+Actions:
+
+- run broad Datacite DOI landing crawl with heritrix ("pre-ingest")
+- after Datacite crawl completes, run arabesque and ingest any PDF hits
+- run broad non-Datacite DOI landing crawl with heritrix. Use ingest tool to
+ generate (or filter a dump), removing Datacite DOIs and large publishers
+- after non-Datacite crawl completes, run entire ingest request set through in
+ bulk mode
+- start enqueing large-publisher (hard to crawl) OA DOIs to ingest queue
+ for SPNv2 crawling (blocking ingest tool improvement, and also SPNv2 health)
+- start new PUBMEDCENTRAL and ARXIV slow-burn pubmed crawls (heritrix). Use
+ updated ingest tool to generate requests.
+
+
+## Large Seedlist Crawl Iterations
+
+We have a bunch of large, high quality seedlists, most of which haven't been
+updated or crawled in a year or two. Some use DOIs as identifiers, some use an
+internal identifier. As a quick summary:
+
+- unpaywall: currently 25 million DOIs (Crossref only?) with fulltext. URLs may
+ be doi.org, publisher landing page, or direct PDF; may be published version,
+ pre-print, or manuscript (indicated with a flag). Only crawled with heritrix;
+ last crawl was Spring 2019. There is a new dump from late 2019 with a couple
+ million new papers/URLs.
+- microsoft academic (MAG): tens of millions of papers, hundreds of millions of
+ URLs. Last crawled 2018 (?) from a 2016 dump. Getting a new full dump via
+ Azure; new dump includes type info for each URL ("pdf", "landing page", etc).
+ Uses MAG id for each URL, not DOI; hoping new dump has better MAG/DOI
+ mappings. Expect a very large crawl (tens of millions of new URLs).
+- CORE: can do direct crawling of PDFs from their site, as well as external
+ URLs. They largely have pre-prints and IR content. Have not released a dump
+ in a long time. Would expect a couple million new direct (core.ac.uk) URLs
+ and fewer new web URLs (often overlap with other lists, like MAG)
+- semantic scholar: they do regular dumps. Use SHA1 hash of PDF as identifier;
+ it's the "best PDF of a group", so not always the PDF you crawl. Host many OA
+ PDFs on their domain, very fast to crawl, as well as wide-web URLs. Their
+ scope has increased dramatically in recent years due to MAG import; expect a
+ lot of overlap there.
+
+It is increasingly important to not re-crawl or re-process URLs that have
+already been attempted, given the size of these seedlists.
+
+Coding Tasks:
+- transform scripts for all these seedlist sources to create ingest request
+ lists
+- sandcrawler ingest request persist script, which supports setting datetime
+- fix HBase thrift gateway so url agnostic de-dupe can be updated
+- finish ingest worker "skip existing" code path, which looks in sandcrawler-db
+ to see if URL has already been processed (for efficiency)
+
+Actions:
+- transform and persist all these old seedlists, with the URL datetime set to
+ roughly when the URL was added to the upstream corpus
+- transform arabesque output for all old crawls into ingest requests and run
+ through the bulk ingest queue. expect GROBID to be skipped for all these, and
+ for the *requests* not to be updated (SQL ON CONFLICT DO NOTHING). Will
+ update ingest result table with status.
+- fetch new MAG and unpaywall seedlists, transform to ingest requests, persist
+ into ingest request table. use SQL to dump only the *new* URLs (not seen in
+ previous dumps) using the created timestamp, outputting new bulk ingest
+ request lists. if possible, de-dupe between these two. then start bulk
+ heritrix crawls over these two long lists. Probably sharded over several
+ machines. Could also run serially (first one, then the other, with
+ ingest/de-dupe in between). Filter out usual large sites (core, s2, arxiv,
+ pubmed, etc)
+- CORE and Semantic Scholar direct crawls, of only new URLs on their domain
+ (should not significantly conflict/dupe with other bulk crawls)
+
+After this round of big crawls completes we could do iterated crawling of
+smaller seedlists, re-visit URLs that failed to ingest with updated heritrix
+configs or the SPNv2 ingest tool, etc.
+
+
+## GROBID/glutton Matching of Known PDFs
+
+Of the many PDFs in the sandcrawler CDX "working set", many were broadly
+crawled or added via CDX heuristic. In other words, we don't have an identifier
+from a seedlist. We previously ran a matching script in Hadoop that attempted
+to link these to Crossref DOIs based on GROBID extracted metadata. We haven't
+done this in a long time; in the meanwhile we have added many more such PDFs,
+added lots of metadata to our matching set (eg, pubmed and arxiv in addition to
+crossref), and have the new biblio-glutton tool for matching, which may work
+better than our old conservative tool.
+
+We have run GROBID+glutton over basically all of these PDFs. We should be able
+to do a SQL query to select PDFs that:
+
+- have at least one known CDX row
+- GROBID processed successfully and glutton matched to a fatcat release
+- do not have an existing fatcat file (based on sha1hex)
+- output GROBID metadata, `file_meta`, and one or more CDX rows
+
+An update match importer can take this output and create new file entities.
+Then lookup the release and confirm the match to the GROBID metadata, as well
+as any other quality checks, then import into fatcat. We have some existing
+filter code we could use. The verification code should be refactored into a
+reusable method.
+
+It isn't clear to me how many new files/matches we would get from this, but
+could do some test SQL queries to check. At least a million?
+
+A related task is to update the glutton lookup table (elasticsearch index and
+on-disk lookup tables) after more recent metadata imports (Datacite, etc).
+Unsure if we should filter out records or improve matching so that we don't
+match "header" (paper) metadata to non-paper records (like datasets), but still
+allow *reference* matching (citations to datasets).
+
+Coding Tasks:
+- write SQL select function. Optionally, come up with a way to get multiple CDX
+ rows in the output (sub-query?)
+- biblio metadata verify match function (between GROBID metadata and existing
+ fatcat release entity)
+- updated match fatcat importer
+
+Actions:
+- update `fatcat_file` sandcrawler table
+- check how many PDFs this might amount to. both by uniq SHA1 and uniq
+ `fatcat_release` matches
+- do some manual random QA verification to check that this method results in
+ quality content in fatcat
+- run full updated import
+
+
+## No-Identifier PDF New Release Import Pipeline
+
+Previously, as part of longtail OA crawling work, I took a set of PDFs crawled
+from OA journal homepages (where the publisher does not register DOIs), took
+successful GROBID metadata, filtered for metadata quality, and imported about
+1.5 million new release entities into fatcat.
+
+There were a number of metadata issues with this import that we are still
+cleaning up, eg:
+
+- paper actually did have a DOI and should have been associated with existing
+ fatcat release entity; these PDFs mostly came from repository sites which
+ aggregated many PDFs, or due to unintentional outlink crawl configs
+- no container linkage for any of these releases, making coverage tracking or
+ reporting difficult
+- many duplicates in same import set, due to near-identical PDFs (different by
+ SHA-1, but same content and metadata), not merged or grouped in any way
+
+The cleanup process is out of scope for this document, but we want to do
+another round of similar imports, while avoiding these problems.
+
+As a rough sketch of what this would look like (may need to iterate):
+
+- filter to PDFs from longtail OA crawls (eg, based on WARC prefix, or URL domain)
+- filter to PDFs not in fatcat already (in sandcrawler, then verify with lookup)
+- filter to PDFs with successful GROBID extraction and *no* glutton match
+- filter/clean GROBID extracted metadata (in python, not SQL), removing stubs
+ or poor/partial extracts
+- run a fuzzy biblio metadata match against fatcat elasticsearch (see the
+  sketch after this list); use match verification routine to check results
+- if fuzzy match was a hit, consider importing directly as a matched file
+ (especially if there are no existing files for the release)
+- identify container for PDF from any of: domain pattern/domain; GROBID
+ extracted ISSN or journal name; any other heuristic
+- if all these filters pass and there was no fuzzy release match, and there was
+ a container match, import a new release (and the file) into fatcat
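+
+A very rough sketch of the fuzzy elasticsearch match step mentioned above (the
+index name, field names, and query shape are assumptions, not the real fatcat
+search schema; results would still go through the match verification routine):
+
+    import requests
+
+    ES_URL = "http://localhost:9200"    # hypothetical
+    INDEX = "fatcat_release"            # hypothetical index name
+
+    def fuzzy_release_match(title: str, first_author: str = None) -> dict:
+        must = [{"match": {"title": {"query": title, "operator": "and"}}}]
+        if first_author:
+            must.append({"match": {"contrib_names": first_author}})
+        resp = requests.post(
+            "{}/{}/_search".format(ES_URL, INDEX),
+            json={"size": 3, "query": {"bool": {"must": must}}},
+            timeout=30,
+        )
+        resp.raise_for_status()
+        hits = resp.json()["hits"]["hits"]
+        # return the top candidate (if any) for downstream verification
+        return hits[0]["_source"] if hits else None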
+
+Not entirely clear how to solve the near-duplicate issue. Randomize import
+order (eg, sort by file sha1), import slowly with a single thread, and ensure
+elasticsearch re-indexing pipeline is running smoothly so the fuzzy match will
+find recently-imported hits?
+
+In theory we could use biblio-glutton API to do the matching lookups, but I
+think it will be almost as fast to hit our own elasticsearch index. Also the
+glutton backing store is always likely to be out of date. In the future we may
+even write something glutton-compatible that hits our index. Note that this is
+also very similar to how citation matching could work, though it might be
+derailing or over-engineering to come up with a single solution for both
+applications at this time.
+
+A potential issue here is that many of these papers are probably already in
+another large but non-authoritative metadata corpus, like MAG, CORE, SHARE, or
+BASE. Importing from those corpuses would want to go through the same fuzzy
+matching to ensure we aren't creating duplicate releases, but further it would
+be nice to be matching those external identifiers for any newly created
+releases. One approach would be to bulk-import metadata from those sources
+first. There are huge numbers of records in those corpuses, so we would need to
+filter down by journal/container or OA flag first. Another would be to do fuzzy
+matching when we *do* end up importing those corpuses, and update these records
+with the external identifiers. This issue really gets at the crux of a bunch of
+design issues and scaling problems with fatcat! But I think we should or need
+to make progress on these longtail OA imports without perfectly solving these
+larger issues.
+
+Details/Questions:
+- what about non-DOI metadata sources like MAG, CORE, SHARE, BASE? Should we
+ import those first, or do fuzzy matching against those?
+- use GROBID language detection and copy results to newly created releases
+- in single-threaded, could cache "recently matched/imported releases" locally
+ to prevent double-importing
+- cache container matching locally
+
+Coding Tasks:
+- write SQL select statement
+- iterate on GROBID metadata cleaning/transform/filter (have existing code for
+ this somewhere)
+- implement a "fuzzy match" routine that takes biblio metadata (eg, GROBID
+ extracted), looks in fatcat elasticsearch for a match
+- implement "fuzzy container match" routine, using as much available info as
+ possible. Could use chocula sqlite locally, or hit elasticsearch container
+ endpoint
+- update GROBID importer to use fuzzy match and other checks
+
+Actions:
+- run SQL select and estimate bounds on number of new releases created
+- do some manual randomized QA runs to ensure this pipeline is importing
+ quality content in fatcat
+- run a full batch import
+
+
+## Non-authoritative Metadata and Fulltext from Aggregators
+
+This is not fully thought through, but at some point we will probably add one
+or more large external aggregator metadata sources (MAG, Semantic Scholar,
+CORE, SHARE, BASE), and bulk import both metadata records and fulltext at the
+same time. The assumption is that those sources are doing the same fuzzy entity
+merging/de-dupe and crawling we are doing, but they have already done it
+(probably with more resources) and created stable identifiers that we can
+include.
+
+A major blocker for most such imports is metadata licensing (fatcat is CC0,
+others have restrictions). This may not be the case for CORE and SHARE though.
diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md
new file mode 100644
index 0000000..6f6443f
--- /dev/null
+++ b/proposals/20200207_pdftrio.md
@@ -0,0 +1,107 @@
+
+status: deployed
+
+NOTE: while this has been used in production, as of December 2022 the results
+are not used much in practice, and we don't score every PDF that comes along
+
+PDF Trio (ML Classification)
+==============================
+
+This document describes how we intend to integrate the first generation of PDF
+classification work into the sandcrawler processing system.
+
+- abstractions (APIs)
+- schemas
+- how models and dependencies are deployed
+- what code is released where, and under what license
+
+
+## Code Structure
+
+Major components:
+
+**Training code, documentation, datasets:** Not used at run-time (does not need
+to be deployed). Should be public. The datasets (PDFs) are copyrighted, so we
+should only release URL lists that point to wayback.
+
+**Models:** all are static, uploaded to archive.org items, simple download to
+deploy. Should be versioned, and have unique versioned file names or directory
+paths (aka, deploy in parallel).
+
+**Image classifier backend:** vanilla tensorflow serving docker image, with a
+bunch of invocation configs, plus static models.
+
+**BERT backend:** vanilla tensorflow serving docker image, plus config, plus
+models. Basically same as image classifier.
+
+**API service:** currently Flask. Depends on tools like imagemagick, fasttext,
+pdftotext. Seems like apt+pipenv should work?
+
+
+## API Refactors
+
+Changes:
+
+- probably re-write README?
+- refactor python code into directories
+- add python tests
+- tweak schema
+- proper parallelization: uwsgi? async?
+
+New features:
+
+- option to send images, raw text in batches in addition to PDFs.
+
+## Client Code
+
+Basically just like GROBID client for now. Requests, JSON.
+
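+A sketch of what such a client call might look like (the host, endpoint path,
+and form field name are assumptions mirroring the GROBID client pattern, not a
+documented pdftrio API):
+
+    import requests
+
+    PDFTRIO_URI = "http://localhost:3939"   # hypothetical host/port
+
+    def classify_pdf(blob: bytes) -> dict:
+        resp = requests.post(
+            PDFTRIO_URI + "/classify/research-pub/all",   # hypothetical path
+            files={"pdf_content": blob},
+            timeout=60.0,
+        )
+        resp.raise_for_status()
+        # expected to contain ensemble_score, bert_score, etc (see schema below)
+        return resp.json()
+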
+## JSON Schema
+
+Output that goes in Kafka topic:
+
+ key (sha1hex)
+ pdf_trio
+ status
+ status_code
+ ensemble_score
+ bert_score
+ image_score
+ linear_score
+ versions
+ pdftrio_version (string)
+ models_date (string, ISO date)
+ git_rev (string)
+ bert_model (string)
+ image_model (string)
+ linear_model (string)
+ timing (optional/future: as reported by API)
+ ...
+ file_meta
+ sha1hex
+ ...
+ timing
+ ...
+
+
+## SQL Schema
+
+Ensemble model versions are summarized as a date.
+
+ CREATE TABLE IF NOT EXISTS pdftrio (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status_code INT NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1),
+ models_date DATE,
+ ensemble_score REAL,
+ bert_score REAL,
+ linear_score REAL,
+ image_score REAL
+ );
+
+## Kafka Topic
+
+sandcrawler-qa.pdftrio-output
+
diff --git a/proposals/20200211_nsq.md b/proposals/20200211_nsq.md
new file mode 100644
index 0000000..6aa885b
--- /dev/null
+++ b/proposals/20200211_nsq.md
@@ -0,0 +1,79 @@
+
+status: planned
+
+In short, Kafka is not working well as a job task scheduler, and I want to try
+NSQ as a medium-term solution to that problem.
+
+
+## Motivation
+
+Thinking of setting up NSQ to use for scheduling distributed work, to replace
+kafka for some topics. for example, "regrobid" requests where we enqueue
+millions of, basically, CDX lines, and want to process on dozens of cores or
+multiple machines. or file ingest backfill. results would still go to kafka (to
+persist), and pipelines like DOI harvest -> import -> elasticsearch would still
+be kafka
+
+The pain point with kafka is having dozens of workers on tasks that take more
+than a couple seconds per task. we could keep tweaking kafka and writing weird
+consumer group things to handle this, but I think it will never work very well.
+NSQ supports re-queues with delay (eg, on failure, defer to re-process later),
+allows many workers to connect and leave with no disruption, messages don't
+have to be processed in order, and has a very simple enqueue API (HTTP POST).
+
+The slowish tasks we have now are file ingest (wayback and/or SPNv2 +
+GROBID) and re-GROBID. In the near future will also have ML backlog to go
+through.
+
+Throughput isn't much of a concern as tasks take 10+ seconds each.
+
+
+## Specific Plan
+
+Continue publishing ingest requests to Kafka topic. Have a new persist worker
+consume from this topic and push to request table (but not result table) using
+`ON CONFLICT DO NOTHING`. Have a new single-process kafka consumer pull from
+the topic and push to NSQ. This consumer monitors NSQ and doesn't push too many
+requests (eg, 1k maximum). NSQ could potentially even run as in-memory mode.
+New worker/pusher class that acts as an NSQ client, possibly with parallelism.
+
+*Clean* NSQ shutdown/restart always persists data locally to disk.
+
+Unclean shutdown (eg, power failure) would mean NSQ might have lost state.
+Because we are persisting requests to sandcrawler-db, cleanup is simple:
+re-enqueue all requests from the past N days with null result or result older
+than M days.
+
+Still need multiple kafka and NSQ topics to have priority queues (eg, bulk,
+platform-specific).
+
+To start, have a single static NSQ host; don't need nsqlookupd. Could use
+wbgrp-svc506 (datanode VM with SSD, lots of CPU and RAM).
+
+To move hosts, simply restart the kafka pusher pointing at the new NSQ host.
+When the old host's queue is empty, restart the workers to consume from the new
+host, and destroy the old NSQ host.
+
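+For reference, a minimal sketch of how the kafka-to-NSQ pusher could enqueue a
+single task using nsqd's HTTP publish endpoint (host and topic names are
+placeholders):
+
+    import json
+    import requests
+
+    NSQD_URL = "http://wbgrp-svc506.us.archive.org:4151"   # default nsqd HTTP port
+
+    def nsq_publish(topic: str, task: dict) -> None:
+        resp = requests.post(
+            NSQD_URL + "/pub",
+            params={"topic": topic},
+            data=json.dumps(task).encode("utf-8"),
+        )
+        resp.raise_for_status()
+
+    # nsq_publish("sandcrawler-qa.ingest-file-requests",
+    #             {"ingest_type": "pdf", "base_url": "https://example.org/paper.pdf"})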
+
+## Alternatives
+
+Work arounds i've done to date have been using the `grobid_tool.py` or
+`ingest_tool.py` JSON input modes to pipe JSON task files (millions of lines)
+through GNU/parallel. I guess GNU/parallel's distributed mode is also an option
+here.
+
+Other things that could be used:
+
+**celery**: popular, many features. need to run separate redis, no disk persistence (?)
+
+**disque**: need to run redis, no disk persistence (?) <https://github.com/antirez/disque>
+
+**gearman**: <http://gearman.org/> no disk persistence (?)
+
+
+## Old Notes
+
+TBD if we would want to switch ingest requests from fatcat -> sandcrawler over,
+and have the continuous ingests run out of NSQ, or keep using kafka for that.
+currently can only do up to 10x parallelism or so with SPNv2, so that isn't a
+scaling pain point
diff --git a/proposals/20201012_no_capture.md b/proposals/20201012_no_capture.md
new file mode 100644
index 0000000..7f6a1f5
--- /dev/null
+++ b/proposals/20201012_no_capture.md
@@ -0,0 +1,39 @@
+
+status: work-in-progress
+
+NOTE: as of December 2022, bnewbold can't remember if this was fully
+implemented or not.
+
+Storing no-capture missing URLs in `terminal_url`
+=================================================
+
+Currently, when the bulk-mode ingest code terminates with a `no-capture`
+status, the missing URL (which is not in GWB CDX) is not stored in
+sandcrawler-db. This proposed change is to include it in the existing
+`terminal_url` database column, with the `terminal_status_code` and
+`terminal_dt` columns empty.
+
+The implementation is rather simple:
+
+- CDX lookup code path should save the *actual* final missing URL (`next_url`
+ after redirects) in the result object's `terminal_url` field
+- ensure that this field gets passed through all the way to the database on the
+ `no-capture` code path
+
+This change does change the semantics of the `terminal_url` field somewhat, and
+could break existing assumptions, so it is being documented in this proposal
+document.
+
+
+## Alternatives
+
+The current status quo is to store the missing URL as the last element in the
+"hops" field of the JSON structure. We could keep this and have a convoluted
+pipeline that would read from the Kafka feed and extract them, but this would
+be messy. Eg, re-ingesting would not update the old kafka messages, so we could
+need some accounting of consumer group offsets after which missing URLs are
+truly missing.
+
+We could add a new `missing_url` database column and field to the JSON schema,
+for this specific use case. This seems like unnecessary extra work.
+
diff --git a/proposals/20201026_html_ingest.md b/proposals/20201026_html_ingest.md
new file mode 100644
index 0000000..785471b
--- /dev/null
+++ b/proposals/20201026_html_ingest.md
@@ -0,0 +1,129 @@
+
+status: deployed
+
+HTML Ingest Pipeline
+========================
+
+Basic goal: given an ingest request of type 'html', output an object (JSON)
+which could be imported into fatcat.
+
+Should work with things like (scholarly) blog posts, micropubs, registrations,
+protocols. Doesn't need to work with everything to start. "Platform" sites
+(like youtube, figshare, etc) will probably be a different ingest worker.
+
+A current unknown is what the expected size of this metadata is. Both in number
+of documents and amount of metadata per document.
+
+Example HTML articles to start testing:
+
+- complex distill article: <https://distill.pub/2020/bayesian-optimization/>
+- old HTML journal: <http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm>
+- NIH pub: <https://www.nlm.nih.gov/pubs/techbull/ja02/ja02_locatorplus_merge.html>
+- first mondays (OJS): <https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729>
+- d-lib: <http://www.dlib.org/dlib/july17/williams/07williams.html>
+
+
+## Ingest Process
+
+Follow base URL to terminal document, which is assumed to be a status=200 HTML document.
+
+Verify that terminal document is fulltext. Extract both metadata and fulltext.
+
+Extract list of sub-resources. Filter out unwanted (eg favicon, analytics,
+unnecessary), apply a sanity limit. Convert to fully qualified URLs. For each
+sub-resource, fetch down to the terminal resource, and compute hashes/metadata.
+
+Open questions:
+
+- will probably want to parallelize sub-resource fetching. async?
+- behavior when failure fetching sub-resources
+
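+A rough sketch of the sub-resource extraction step described above
+(BeautifulSoup is used purely for illustration; the actual worker may use a
+different HTML parser, and the tag/attribute list is not exhaustive):
+
+    from urllib.parse import urljoin
+    from bs4 import BeautifulSoup
+
+    WANTED = [("img", "src"), ("script", "src"), ("link", "href"), ("source", "src")]
+    MAX_RESOURCES = 200   # sanity limit
+
+    def extract_subresources(html: str, base_url: str) -> list:
+        soup = BeautifulSoup(html, "html.parser")
+        urls = []
+        for tag, attr in WANTED:
+            for node in soup.find_all(tag):
+                raw = node.get(attr)
+                if raw:
+                    urls.append(urljoin(base_url, raw))
+        # de-dupe (preserving order), drop obvious junk, cap the list
+        urls = [u for u in dict.fromkeys(urls) if "favicon" not in u]
+        return urls[:MAX_RESOURCES]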
+
+## Ingest Result Schema
+
+JSON should be basically compatible with existing `ingest_file_result` objects,
+with some new sub-objects.
+
+Overall object (`IngestWebResult`):
+
+- `status`: str
+- `hit`: bool
+- `error_message`: optional, if an error
+- `hops`: optional, array of URLs
+- `cdx`: optional; single CDX row of primary HTML document
+- `terminal`: optional; same as ingest result
+ - `terminal_url`
+ - `terminal_dt`
+ - `terminal_status_code`
+ - `terminal_sha1hex`
+- `request`: optional but usually present; ingest request object, verbatim
+- `file_meta`: optional; file metadata about primary HTML document
+- `html_biblio`: optional; extracted biblio metadata from primary HTML document
+- `scope`: optional; detected/guessed scope (fulltext, etc)
+- `html_resources`: optional; array of sub-resources. primary HTML is not included
+- `html_body`: optional; just the status code and some metadata is passed through;
+ actual document would go through a different KafkaTopic
+ - `status`: str
+ - `agent`: str, eg "trafilatura/0.4"
+ - `tei_xml`: optional, str
+  - `word_count`: optional, int
+
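+A skeletal example of a successful result under this schema (all values are
+illustrative placeholders):
+
+    {
+        "status": "success",
+        "hit": true,
+        "hops": ["https://blog.example.org/post"],
+        "terminal": {
+            "terminal_url": "https://blog.example.org/post",
+            "terminal_dt": "20201026120000",
+            "terminal_status_code": 200,
+            "terminal_sha1hex": "da39a3ee5e6b4b0d3255bfef95601890afd80709"
+        },
+        "scope": "article-fulltext",
+        "html_body": {"status": "success", "agent": "trafilatura/0.4", "word_count": 1200},
+        "html_resources": []
+    }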
+
+## New SQL Tables
+
+`html_meta`
+ sha1hex (primary key)
+ updated (of SQL row)
+ status
+ scope
+ has_teixml
+ has_thumbnail
+ word_count (from teixml fulltext)
+ biblio (JSON)
+ resources (JSON)
+
+Also writes to `ingest_file_result`, `file_meta`, and `cdx`, all only for the base HTML document.
+
+Note: needed to enable postgrest access to this table (for scholar worker).
+
+
+## Fatcat API Wants
+
+Would be nice to have lookup by SURT+timestamp, and/or by sha1hex of terminal base file.
+
+`hide` option for cdx rows; also for fileset equivalent.
+
+
+## New Workers
+
+Could reuse existing worker, have code branch depending on type of ingest.
+
+ingest file worker
+ => same as existing worker, because could be calling SPN
+
+persist result
+ => same as existing worker; adds persisting various HTML metadata
+
+persist html text
+ => talks to seaweedfs
+
+
+## New Kafka Topics
+
+HTML ingest result topic (webcapture-ish)
+
+sandcrawler-ENV.html-teixml
+ JSON wrapping TEI-XML (same as other fulltext topics)
+ key compaction and content compression enabled
+
+JSON schema:
+
+- `key` and `sha1hex`: str; used as kafka key
+- `status`: str
+- `tei_xml`: str, optional
+- `word_count`: int, optional
+
+## New S3/SeaweedFS Content
+
+`sandcrawler` bucket, `html` folder, `.tei.xml` suffix.
+
diff --git a/proposals/20201103_xml_ingest.md b/proposals/20201103_xml_ingest.md
new file mode 100644
index 0000000..34e00b0
--- /dev/null
+++ b/proposals/20201103_xml_ingest.md
@@ -0,0 +1,64 @@
+
+status: deployed
+
+XML Fulltext Ingest
+====================
+
+This document details changes to include XML fulltext ingest in the same way
+that we currently ingest PDF fulltext.
+
+Currently this will just fetch the single XML document, which is often lacking
+figures, tables, and other required files.
+
+## Text Encoding
+
+Because we would like to treat XML as a string in a couple contexts, but XML
+can have multiple encodings (indicated in an XML header), we are in a bit of a
+bind. Simply parsing into unicode and then re-encoding as UTF-8 could result in
+a header/content mismatch. Any form of re-encoding will change the hash of the
+document. For recording in fatcat, the file metadata will be passed through.
+For storing in Kafka and blob store (for downstream analysis), we will parse
+the raw XML document (as "bytes") with an XML parser, then re-output with UTF-8
+encoding. The hash of the *original* XML file will be used as the key for
+referring to this document. This is unintuitive, but similar to what we are
+doing with PDF and HTML documents (extracting in a useful format, but keeping
+the original document's hash as a key).
+
+Unclear if we need to do this re-encode process for XML documents already in
+UTF-8 encoding.
+
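+A minimal sketch of this hash-then-re-encode step (using the standard-library
+XML parser for illustration; the real worker may use lxml, and the output field
+name is a placeholder):
+
+    import hashlib
+    import xml.etree.ElementTree as ET
+
+    def process_xml(raw: bytes) -> dict:
+        # the key/identifier is always the hash of the *original* bytes
+        sha1hex = hashlib.sha1(raw).hexdigest()
+        root = ET.fromstring(raw)   # parser honors the declared encoding
+        xml_utf8 = ET.tostring(root, encoding="unicode")   # re-serialized str
+        return {"key": sha1hex, "sha1hex": sha1hex, "status": "success",
+                "xml_str": xml_utf8}
+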
+## Ingest Worker
+
+Could either re-use HTML metadata extractor to fetch XML fulltext links, or
+fork that code off to a separate method, like the PDF fulltext URL extractor.
+
+Hopefully can re-use almost all of the PDF pipeline code, by making that ingest
+worker class more generic and subclassing it.
+
+Result objects are treated the same as PDF ingest results: the result object
+has context about status, and if successful, file metadata and CDX row of the
+terminal object.
+
+TODO: should it be assumed that XML fulltext will end up in S3 bucket? or
+should there be an `xml_meta` SQL table tracking this, like we have for PDFs
+and HTML?
+
+TODO: should we detect and specify the XML schema better? Eg, indicate if JATS.
+
+
+## Persist Pipeline
+
+### Kafka Topic
+
+sandcrawler-ENV.xml-doc
+ similar to other fulltext topics; JSON wrapping the XML
+ key compaction, content compression
+
+### S3/SeaweedFS
+
+`sandcrawler` bucket, `xml` folder. Extension could depend on sub-type of XML?
+
+### Persist Worker
+
+New S3-only worker that pulls from kafka topic and pushes to S3. Works
+basically the same as PDF persist in S3-only mode, or like pdf-text worker.
diff --git a/proposals/2020_pdf_meta_thumbnails.md b/proposals/2020_pdf_meta_thumbnails.md
new file mode 100644
index 0000000..141ece8
--- /dev/null
+++ b/proposals/2020_pdf_meta_thumbnails.md
@@ -0,0 +1,328 @@
+
+status: deployed
+
+New PDF derivatives: thumbnails, metadata, raw text
+===================================================
+
+To support scholar.archive.org (fulltext search) and other downstream uses of
+fatcat, want to extract from many PDFs:
+
+- pdf structured metadata
+- thumbnail images
+- raw extracted text
+
+A single worker should extract all of these fields, and publish in to two kafka
+streams. Separate persist workers consume from the streams and push in to SQL
+and/or seaweedfs.
+
+Additionally, this extraction should happen automatically for newly-crawled
+PDFs as part of the ingest pipeline. When possible, checks should be run
+against the existing SQL table to avoid duplication of processing.
+
+
+## PDF Metadata and Text
+
+Kafka topic (name: `sandcrawler-ENV.pdf-text`; 12x partitions; gzip
+compression) JSON schema:
+
+ sha1hex (string; used as key)
+ status (string)
+ text (string)
+ page0_thumbnail (boolean)
+ meta_xml (string)
+ pdf_info (object)
+ pdf_extra (object)
+ word_count
+ file_meta (object)
+ source (object)
+
+For the SQL table we should have columns for metadata fields that are *always*
+saved, and put a subset of other interesting fields in a JSON blob. We don't
+need all metadata fields in SQL. Full metadata/info will always be available in
+Kafka, and we don't want SQL table size to explode. Schema:
+
+ CREATE TABLE IF NOT EXISTS pdf_meta (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ has_page0_thumbnail BOOLEAN NOT NULL,
+ page_count INT CHECK (page_count >= 0),
+ word_count INT CHECK (word_count >= 0),
+ page0_height REAL CHECK (page0_height >= 0),
+ page0_width REAL CHECK (page0_width >= 0),
+ permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),
+ pdf_created TIMESTAMP WITH TIME ZONE,
+ pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),
+ metadata JSONB
+ -- maybe some analysis of available fields?
+ -- metadata JSON fields:
+ -- title
+ -- subject
+ -- author
+ -- creator
+ -- producer
+ -- CrossMarkDomains
+ -- doi
+ -- form
+ -- encrypted
+ );
+
+
+## Thumbnail Images
+
+Kafka Schema is raw image bytes as message body; sha1sum of PDF as the key. No
+compression, 12x partitions.
+
+Kafka topic name is `sandcrawler-ENV.pdf-thumbnail-SIZE-TYPE` (eg,
+`sandcrawler-qa.pdf-thumbnail-180px-jpg`). Thus, topic name contains the
+"metadata" of thumbnail size/shape.
+
+Have decided to use JPEG thumbnails, 180px wide (and max 300px high, though
+width restriction is almost always the limiting factor). This size matches that
+used on archive.org, and is slightly larger than the thumbnails currently used
+on scholar.archive.org prototype. We intend to tweak the scholar.archive.org
+CSS to use the full/raw thumbnail image at max desktop size. At this size it
+would be difficult (though maybe not impossible?) to extract text (other than
+large-font titles).
+
+
+### Implementation
+
+We use the `poppler` CPP library (wrapper for python) to extract and convert everything.
+
+Some example usage of the `python-poppler` library:
+
+    import poppler
+    from PIL import Image
+
+    pdf = poppler.load_from_file("/home/bnewbold/10.1038@s41551-020-0534-9.pdf")
+    pdf.pdf_id                   # document id (source of the `permanent_id` field)
+    page = pdf.create_page(0)    # only the first page is needed for a thumbnail
+    page.page_rect().width
+
+    renderer = poppler.PageRenderer()
+    full_page = renderer.render_page(page)
+    # wrap the raw RGBA buffer in a PIL image, downscale, and save as JPEG
+    # (convert to RGB first, since JPEG has no alpha channel)
+    img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "RGBA")
+    img = img.convert("RGB")
+    img.thumbnail((180,300), Image.BICUBIC)
+    img.save("something.jpg")
+
+## Deployment and Infrastructure
+
+Deployment will involve:
+
+- sandcrawler DB SQL table
+ => guesstimate size 100 GByte for hundreds of PDFs
+- postgrest/SQL access to new table for internal HTTP API hits
+- seaweedfs raw text folder
+ => reuse existing bucket with GROBID XML; same access restrictions on content
+- seaweedfs thumbnail bucket
+ => new bucket for this world-public content
+- public nginx access to seaweed thumbnail bucket
+- extraction work queue kafka topic
+ => same schema/semantics as ungrobided
+- text/metadata kafka topic
+- thumbnail kafka topic
+- text/metadata persist worker(s)
+ => from kafka; metadata to SQL database; text to seaweedfs blob store
+- thumbnail persist worker
+ => from kafka to seaweedfs blob store
+- pdf extraction worker pool
+ => very similar to GROBID worker pool
+- ansible roles for all of the above
+
+Plan for processing/catchup is:
+
+- test with COVID-19 PDF corpus
+- run extraction on all current fatcat files available via IA
+- integrate with ingest pipeline for all new files
+- run a batch catchup job over all GROBID-parsed files with no pdf meta
+  extracted, on the basis of a SQL table query (see sketch below)
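+
+A sketch of that catchup query (assuming the existing `grobid` SQL table is the
+source of already-GROBID-parsed sha1s; streamed via a server-side cursor):
+
+    import psycopg2
+
+    # Sketch: sha1s which have a GROBID result but no pdf_meta row yet.
+    CATCHUP_QUERY = """
+        SELECT grobid.sha1hex
+        FROM grobid
+        LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+        WHERE pdf_meta.sha1hex IS NULL
+    """
+
+    def dump_catchup_sha1s(dsn="dbname=sandcrawler"):
+        with psycopg2.connect(dsn) as conn:
+            with conn.cursor("pdf_meta_catchup") as cur:
+                cur.itersize = 10000
+                cur.execute(CATCHUP_QUERY)
+                for (sha1hex,) in cur:
+                    print(sha1hex)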
+
+## Appendix: Thumbnail Size and Format Experimentation
+
+Using 190 PDFs from `/data/pdfs/random_crawl/files` on my laptop to test.
+
+TODO: actually, 4x images failed to convert with pdftocairo; this throws off
+"mean" sizes by a small amount.
+
+ time ls | parallel -j1 pdftocairo -singlefile -scale-to 200 -png {} /tmp/test-png/{}.png
+ real 0m29.314s
+ user 0m26.794s
+ sys 0m2.484s
+ => missing: 4
+ => min: 0.8k
+ => max: 57K
+ => mean: 16.4K
+ => total: 3120K
+
+ time ls | parallel -j1 pdftocairo -singlefile -scale-to 200 -jpeg {} /tmp/test-jpeg/{}.jpg
+ real 0m26.289s
+ user 0m24.022s
+ sys 0m2.490s
+ => missing: 4
+ => min: 1.2K
+ => max: 13K
+ => mean: 8.02k
+ => total: 1524K
+
+ time ls | parallel -j1 pdftocairo -singlefile -scale-to 200 -jpeg -jpegopt optimize=y,quality=80 {} /tmp/test-jpeg2/{}.jpg
+ real 0m27.401s
+ user 0m24.941s
+ sys 0m2.519s
+ => missing: 4
+ => min: 577
+ => max: 14K
+ => mean:
+ => total: 1540K
+
+ time ls | parallel -j1 convert -resize 200x200 {}[0] /tmp/magick-png/{}.png
+ => missing: 4
+ real 1m19.399s
+ user 1m17.150s
+ sys 0m6.322s
+ => min: 1.1K
+ => max: 325K
+ => mean:
+ => total: 8476K
+
+ time ls | parallel -j1 convert -resize 200x200 {}[0] /tmp/magick-jpeg/{}.jpg
+ real 1m21.766s
+ user 1m17.040s
+ sys 0m7.155s
+ => total: 3484K
+
+NOTE: the following `pdf_thumbnail.py` images are somewhat smaller than the above
+jpg and pngs (max 180px wide, not 200px wide)
+
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-png/{}.png
+ real 0m48.198s
+ user 0m42.997s
+ sys 0m4.509s
+ => missing: 2; 2x additional stub images
+ => total: 5904K
+
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg/{}.jpg
+ real 0m45.252s
+ user 0m41.232s
+ sys 0m4.273s
+ => min: 1.4K
+ => max: 16K
+ => mean: ~9.3KByte
+ => total: 1772K
+
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg-360/{}.jpg
+ real 0m48.639s
+ user 0m44.121s
+ sys 0m4.568s
+ => mean: ~28k
+ => total: 5364K (3x of 180px batch)
+
+ quality=95
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg2-360/{}.jpg
+ real 0m49.407s
+ user 0m44.607s
+ sys 0m4.869s
+ => total: 9812K
+
+ quality=95
+ time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg2-180/{}.jpg
+ real 0m45.901s
+ user 0m41.486s
+ sys 0m4.591s
+ => mean: 16.4K
+ => total: 3116K
+
+At the 180px size, the difference between default and quality=95 seems
+indistinguishable visually to me, but is more than a doubling of file size.
+Also tried at 300px and seems near-indistinguishable there as well.
+
+At a mean of 10 Kbytes per file:
+
+ 10 million -> 100 GBytes
+ 100 million -> 1 Tbyte
+
+Older COVID-19 thumbnails were about 400px wide:
+
+ pdftocairo -png -singlefile -scale-to-x 400 -scale-to-y -1
+
+Display on scholar-qa.archive.org is about 135x181px
+
+archive.org does 180px wide
+
+Unclear if we should try to do double resolution for high DPI screens (eg,
+apple "retina").
+
+Using same size as archive.org probably makes the most sense: max 180px wide,
+preserve aspect ratio. And jpeg improvement seems worth it.
+
+#### Merlijn notes
+
+From work on optimizing microfilm thumbnail images:
+
+ When possible, generate a thumbnail that fits well on the screen of the
+ user. Always creating a large thumbnail will result in the browsers
+    downscaling them, leading to fuzzy text. If it’s not possible, then pick
+    the resolution you’d want to support (1.5x or 2x scaling) and create
+    thumbnails of that size, but also apply the other recommendations
+ below - especially a sharpening filter.
+
+ Use bicubic or lanczos interpolation. Bilinear and nearest neighbour are
+ not OK.
+
+ For text, consider applying a sharpening filter. Not a strong one, but some
+ sharpening can definitely help.
+
+
+## Appendix: PDF Info Fields
+
+From `pdfinfo` manpage:
+
+    The 'Info' dictionary contains the following values:
+
+ title
+ subject
+ keywords
+ author
+ creator
+ producer
+ creation date
+ modification date
+
+ In addition, the following information is printed:
+
+ tagged (yes/no)
+ form (AcroForm / XFA / none)
+ javascript (yes/no)
+ page count
+ encrypted flag (yes/no)
+ print and copy permissions (if encrypted)
+ page size
+ file size
+ linearized (yes/no)
+ PDF version
+ metadata (only if requested)
+
+For an example file, the output looks like:
+
+ Title: A mountable toilet system for personalized health monitoring via the analysis of excreta
+ Subject: Nature Biomedical Engineering, doi:10.1038/s41551-020-0534-9
+ Keywords:
+ Author: Seung-min Park
+ Creator: Springer
+ CreationDate: Thu Mar 26 01:26:57 2020 PDT
+ ModDate: Thu Mar 26 01:28:06 2020 PDT
+ Tagged: no
+ UserProperties: no
+ Suspects: no
+ Form: AcroForm
+ JavaScript: no
+ Pages: 14
+ Encrypted: no
+ Page size: 595.276 x 790.866 pts
+ Page rot: 0
+ File size: 6104749 bytes
+ Optimized: yes
+ PDF version: 1.4
+
+For context on the `pdf_id` fields ("original" and "updated"), read:
+<https://web.hypothes.is/blog/synchronizing-annotations-between-local-and-remote-pdfs/>
diff --git a/proposals/2020_seaweed_s3.md b/proposals/2020_seaweed_s3.md
new file mode 100644
index 0000000..677393b
--- /dev/null
+++ b/proposals/2020_seaweed_s3.md
@@ -0,0 +1,426 @@
+# Notes on seaweedfs
+
+> 2020-04-28, martin@archive.org
+
+Currently (04/2020) [minio](https://github.com/minio/minio) is used to store
+output from PDF analysis for [fatcat](https://fatcat.wiki) (e.g. from
+[grobid](https://grobid.readthedocs.io/en/latest/)). The file checksum (sha1)
+serves as key, values are blobs of XML or JSON.
+
+Problem: minio inserts slowed down after inserting 80M or more objects.
+
+Summary: I did four test runs, three failed, one (testrun-4) succeeded.
+
+* [testrun-4](https://git.archive.org/webgroup/sandcrawler/-/blob/master/proposals/2020_seaweed_s3.md#testrun-4)
+
+So far, in a non-distributed mode, the project looks usable. Added 200M objects
+(about 550G) in 6 days. Full CPU load, 400M RAM usage, constant insert times.
+
+----
+
+Details (03/2020) / @bnewbold, slack
+
+> the sandcrawler XML data store (currently on aitio) is grinding to a halt, I
+> think because despite tuning minio+ext4+hdd just doesn't work. current at 2.6
+> TiB of data (each document compressed with snappy) and 87,403,183 objects.
+
+> this doesn't impact ingest processing (because content is queued and archived
+> in kafka), but does impact processing and analysis
+
+> it is possible that the other load on aitio is making this worse, but I did
+> an experiment with dumping to a 16 TB disk that slowed way down after about
+> 50 million files also. some people on the internet said to just not worry
+> about these huge file counts on modern filesystems, but i've debugged a bit
+> and I think it is a bad idea after all
+
+Possible solutions
+
+* putting content in fake WARCs and trying to do something like CDX
+* deploy CEPH object store (or swift, or any other off-the-shelf object store)
+* try putting the files in postgres tables, mongodb, cassandra, etc: these are
+ not designed for hundreds of millions of ~50 KByte XML documents (5 - 500
+ KByte range)
+* try to find or adapt an open source tool like Haystack, Facebook's solution
+ to this engineering problem. eg:
+ https://engineering.linkedin.com/blog/2016/05/introducing-and-open-sourcing-ambry---linkedins-new-distributed-
+
+----
+
+The following are notes gathered during a few test runs of seaweedfs in 04/2020
+on wbgrp-svc170.us.archive.org (4 core E5-2620 v4, 4GB RAM).
+
+----
+
+## Setup
+
+There are frequent [releases](https://github.com/chrislusf/seaweedfs/releases)
+but for the test, we used a build off master branch.
+
+Directions for configuring AWS CLI for seaweedfs:
+[https://github.com/chrislusf/seaweedfs/wiki/AWS-CLI-with-SeaweedFS](https://github.com/chrislusf/seaweedfs/wiki/AWS-CLI-with-SeaweedFS).
+
+### Build the binary
+
+Using development version (requires a [Go installation](https://golang.org/dl/)).
+
+```
+$ git clone git@github.com:chrislusf/seaweedfs.git # 11f5a6d9
+$ cd seaweedfs
+$ make
+$ ls -lah weed/weed
+-rwxr-xr-x 1 tir tir 55M Apr 17 16:57 weed
+
+$ git rev-parse HEAD
+11f5a6d91346e5f3cbf3b46e0a660e231c5c2998
+
+$ sha1sum weed/weed
+a7f8f0b49e6183da06fc2d1411c7a0714a2cc96b
+```
+
+A single, 55M binary emerges after a few seconds. The binary contains
+subcommands to run different parts of seaweed, e.g. master or volume servers,
+filer and commands for maintenance tasks, like backup and compaction.
+
+To *deploy*, just copy this binary to the destination.
+
+### Quickstart with S3
+
+Assuming `weed` binary is in PATH.
+
+Start a master and volume server (over /tmp, most likely) and the S3 API with a single command:
+
+```
+$ weed server -s3
+...
+Start Seaweed Master 30GB 1.74 at 0.0.0.0:9333
+...
+Store started on dir: /tmp with 0 volumes max 7
+Store started on dir: /tmp with 0 ec shards
+Volume server start with seed master nodes: [localhost:9333]
+...
+Start Seaweed S3 API Server 30GB 1.74 at http port 8333
+...
+```
+
+Install the [AWS
+CLI](https://github.com/chrislusf/seaweedfs/wiki/AWS-CLI-with-SeaweedFS).
+Create a bucket.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 mb s3://sandcrawler-dev
+make_bucket: sandcrawler-dev
+```
+
+List buckets.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 ls
+2020-04-17 17:44:39 sandcrawler-dev
+```
+
+Create a dummy file.
+
+```
+$ echo "blob" > 12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml
+```
+
+Upload.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 cp 12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml s3://sandcrawler-dev
+upload: ./12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml to s3://sandcrawler-dev/12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml
+```
+
+List.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 ls s3://sandcrawler-dev
+2020-04-17 17:50:35 5 12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml
+```
+
+Stream to stdout.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 cp s3://sandcrawler-dev/12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml -
+blob
+```
+
+Drop the bucket.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 rm --recursive s3://sandcrawler-dev
+```
+
+### Builtin benchmark
+
+The project comes with a builtin benchmark command.
+
+```
+$ weed benchmark
+```
+
+I encountered an error like
+[#181](https://github.com/chrislusf/seaweedfs/issues/181), "no free volume
+left" - when trying to start the benchmark after the S3 ops. A restart, or a
+restart with `-volume.max 100`, helped.
+
+```
+$ weed server -s3 -volume.max 100
+```
+
+### Listing volumes
+
+```
+$ weed shell
+> volume.list
+Topology volume:15/112757 active:8 free:112742 remote:0 volumeSizeLimit:100 MB
+ DataCenter DefaultDataCenter volume:15/112757 active:8 free:112742 remote:0
+ Rack DefaultRack volume:15/112757 active:8 free:112742 remote:0
+ DataNode localhost:8080 volume:15/112757 active:8 free:112742 remote:0
+ volume id:1 size:105328040 collection:"test" file_count:33933 version:3 modified_at_second:1587215730
+ volume id:2 size:106268552 collection:"test" file_count:34236 version:3 modified_at_second:1587215730
+ volume id:3 size:106290280 collection:"test" file_count:34243 version:3 modified_at_second:1587215730
+ volume id:4 size:105815368 collection:"test" file_count:34090 version:3 modified_at_second:1587215730
+ volume id:5 size:105660168 collection:"test" file_count:34040 version:3 modified_at_second:1587215730
+ volume id:6 size:106296488 collection:"test" file_count:34245 version:3 modified_at_second:1587215730
+ volume id:7 size:105753288 collection:"test" file_count:34070 version:3 modified_at_second:1587215730
+ volume id:8 size:7746408 file_count:12 version:3 modified_at_second:1587215764
+ volume id:9 size:10438760 collection:"test" file_count:3363 version:3 modified_at_second:1587215788
+ volume id:10 size:10240104 collection:"test" file_count:3299 version:3 modified_at_second:1587215788
+ volume id:11 size:10258728 collection:"test" file_count:3305 version:3 modified_at_second:1587215788
+ volume id:12 size:10240104 collection:"test" file_count:3299 version:3 modified_at_second:1587215788
+ volume id:13 size:10112840 collection:"test" file_count:3258 version:3 modified_at_second:1587215788
+ volume id:14 size:10190440 collection:"test" file_count:3283 version:3 modified_at_second:1587215788
+ volume id:15 size:10112840 collection:"test" file_count:3258 version:3 modified_at_second:1587215788
+ DataNode localhost:8080 total size:820752408 file_count:261934
+ Rack DefaultRack total size:820752408 file_count:261934
+ DataCenter DefaultDataCenter total size:820752408 file_count:261934
+total size:820752408 file_count:261934
+```
+
+### Custom S3 benchmark
+
+To simulate the use case of S3 for 100-500M small files (grobid xml, pdftotext,
+...), I created a synthetic benchmark.
+
+* [https://gist.github.com/miku/6f3fee974ba82083325c2f24c912b47b](https://gist.github.com/miku/6f3fee974ba82083325c2f24c912b47b)
+
+We just try to fill up the datastore with millions of 5k blobs.
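+
+The core of that script is just a `put_object` loop against the S3 endpoint,
+roughly like this sketch (bucket name, key scheme, and dummy credentials are
+illustrative; the real script runs several such loops in separate processes):
+
+```
+import random
+import string
+import boto3
+
+s3 = boto3.resource(
+    "s3",
+    endpoint_url="http://localhost:8333",
+    aws_access_key_id="any",
+    aws_secret_access_key="any",
+)
+
+def insert_keys(bucket, start, count, blob_size=5000):
+    data = "".join(random.choice(string.ascii_lowercase) for _ in range(blob_size)).encode()
+    for i in range(start, start + count):
+        s3.Bucket(bucket).put_object(Key="k{}".format(i), Body=data)
+
+insert_keys("test", 0, 1000)
+```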
+
+----
+
+### testrun-1
+
+Small set, just to run. Status: done. Learned that the default in-memory volume
+index grows too quickly for the 4GB RAM machine.
+
+```
+$ weed server -dir /tmp/martin-seaweedfs-testrun-1 -s3 -volume.max 512 -master.volumeSizeLimitMB 100
+```
+
+* https://github.com/chrislusf/seaweedfs/issues/498 -- RAM
+* at 10M files, we already consume ~1G
+
+```
+-volume.index string
+ Choose [memory|leveldb|leveldbMedium|leveldbLarge] mode for memory~performance balance. (default "memory")
+```
+
+### testrun-2
+
+200M 5k objects, in-memory volume index. Status: done. Observed: After 18M
+objects the 512 100MB volumes are exhausted and seaweedfs will not accept any
+new data.
+
+```
+$ weed server -dir /tmp/martin-seaweedfs-testrun-2 -s3 -volume.max 512 -master.volumeSizeLimitMB 100
+...
+I0418 12:01:43 1622 volume_loading.go:104] loading index /tmp/martin-seaweedfs-testrun-2/test_511.idx to memory
+I0418 12:01:43 1622 store.go:122] add volume 511
+I0418 12:01:43 1622 volume_layout.go:243] Volume 511 becomes writable
+I0418 12:01:43 1622 volume_growth.go:224] Created Volume 511 on topo:DefaultDataCenter:DefaultRack:localhost:8080
+I0418 12:01:43 1622 master_grpc_server.go:158] master send to master@[::1]:45084: url:"localhost:8080" public_url:"localhost:8080" new_vids:511
+I0418 12:01:43 1622 master_grpc_server.go:158] master send to filer@::1:18888: url:"localhost:8080" public_url:"localhost:8080" new_vids:511
+I0418 12:01:43 1622 store.go:118] In dir /tmp/martin-seaweedfs-testrun-2 adds volume:512 collection:test replicaPlacement:000 ttl:
+I0418 12:01:43 1622 volume_loading.go:104] loading index /tmp/martin-seaweedfs-testrun-2/test_512.idx to memory
+I0418 12:01:43 1622 store.go:122] add volume 512
+I0418 12:01:43 1622 volume_layout.go:243] Volume 512 becomes writable
+I0418 12:01:43 1622 master_grpc_server.go:158] master send to master@[::1]:45084: url:"localhost:8080" public_url:"localhost:8080" new_vids:512
+I0418 12:01:43 1622 master_grpc_server.go:158] master send to filer@::1:18888: url:"localhost:8080" public_url:"localhost:8080" new_vids:512
+I0418 12:01:43 1622 volume_growth.go:224] Created Volume 512 on topo:DefaultDataCenter:DefaultRack:localhost:8080
+I0418 12:01:43 1622 node.go:82] topo failed to pick 1 from 0 node candidates
+I0418 12:01:43 1622 volume_growth.go:88] create 7 volume, created 2: No enough data node found!
+I0418 12:04:30 1622 volume_layout.go:231] Volume 511 becomes unwritable
+I0418 12:04:30 1622 volume_layout.go:231] Volume 512 becomes unwritable
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+I0418 12:04:30 1622 filer_server_handlers_write.go:120] fail to allocate volume for /buckets/test/k43731970, collection:test, datacenter:
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left!
+I0418 12:04:30 1622 masterclient.go:88] filer failed to receive from localhost:9333: rpc error: code = Unavailable desc = transport is closing
+I0418 12:04:30 1622 master_grpc_server.go:276] - client filer@::1:18888
+```
+
+Inserted about 18M docs, then:
+
+```
+worker-0 @3720000 45475.13 81.80
+worker-1 @3730000 45525.00 81.93
+worker-3 @3720000 45525.76 81.71
+worker-4 @3720000 45527.22 81.71
+Process Process-1:
+Traceback (most recent call last):
+ File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
+ self.run()
+ File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
+ self._target(*self._args, **self._kwargs)
+ File "s3test.py", line 42, in insert_keys
+ s3.Bucket(bucket).put_object(Key=key, Body=data)
+ File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/boto3/resources/factory.py", line 520, in do_action
+ response = action(self, *args, **kwargs)
+ File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/boto3/resources/action.py", line 83, in __call__
+ response = getattr(parent.meta.client, operation_name)(**params)
+ File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/botocore/client.py", line 316, in _api_call
+ return self._make_api_call(operation_name, kwargs)
+ File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/botocore/client.py", line 626, in _make_api_call
+ raise error_class(parsed_response, operation_name)
+botocore.exceptions.ClientError: An error occurred (InternalError) when calling the PutObject operation (reached max retries: 4): We encountered an internal error, please try again.
+
+real 759m30.034s
+user 1962m47.487s
+sys 105m21.113s
+```
+
+Sustained 400 S3 puts/s, RAM usage 41% of a 4G machine. 56G on disk.
+
+> No free volumes left! Failed to allocate bucket for /buckets/test/k163721819
+
+### testrun-3
+
+* use leveldb, leveldbLarge
+* try "auto" volumes
+* Status: done. Observed: rapid memory usage increase.
+
+```
+$ weed server -dir /tmp/martin-seaweedfs-testrun-3 -s3 -volume.max 0 -volume.index=leveldbLarge -filer=false -master.volumeSizeLimitMB 100
+```
+
+Observations: memory usage grows rapidly, soon at 15%.
+
+Note-to-self: [https://github.com/chrislusf/seaweedfs/wiki/Optimization](https://github.com/chrislusf/seaweedfs/wiki/Optimization)
+
+### testrun-4
+
+The default volume size is 30G (and cannot currently be larger), and RAM usage
+grows quickly with the number of volumes. Therefore, keep the default volume
+size, do not limit the number of volumes (`-volume.max 0`), and use the leveldb
+index rather than the in-memory index.
+
+Status: done. Uploaded 200M objects via the Python script in about 6 days;
+memory usage stayed at a moderate 400M (~10% of RAM). Relatively constant
+performance at about 400 `PutObject` requests/s (over 5 threads, each thread
+was around 80 requests/s; when testing with 4 threads, each thread got to
+around 100 requests/s).
+
+```
+$ weed server -dir /tmp/martin-seaweedfs-testrun-4 -s3 -volume.max 0 -volume.index=leveldb
+```
+
+The test script command was (40M files per worker, 5 workers):
+
+```
+$ time python s3test.py -n 40000000 -w 5 2> s3test.4.log
+...
+
+real 8454m33.695s
+user 21318m23.094s
+sys 1128m32.293s
+```
+
+The test script adds keys from `k0...k199999999`.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 ls s3://test | head -20
+2020-04-19 09:27:13 5000 k0
+2020-04-19 09:27:13 5000 k1
+2020-04-19 09:27:13 5000 k10
+2020-04-19 09:27:15 5000 k100
+2020-04-19 09:27:26 5000 k1000
+2020-04-19 09:29:15 5000 k10000
+2020-04-19 09:47:49 5000 k100000
+2020-04-19 12:54:03 5000 k1000000
+2020-04-20 20:14:10 5000 k10000000
+2020-04-22 07:33:46 5000 k100000000
+2020-04-22 07:33:46 5000 k100000001
+2020-04-22 07:33:46 5000 k100000002
+2020-04-22 07:33:46 5000 k100000003
+2020-04-22 07:33:46 5000 k100000004
+2020-04-22 07:33:46 5000 k100000005
+2020-04-22 07:33:46 5000 k100000006
+2020-04-22 07:33:46 5000 k100000007
+2020-04-22 07:33:46 5000 k100000008
+2020-04-22 07:33:46 5000 k100000009
+2020-04-20 20:14:10 5000 k10000001
+```
+
+Glance at stats.
+
+```
+$ du -hs /tmp/martin-seaweedfs-testrun-4
+596G /tmp/martin-seaweedfs-testrun-4
+
+$ find . /tmp/martin-seaweedfs-testrun-4 | wc -l
+5104
+
+$ ps --pid $(pidof weed) -o pid,tid,class,stat,vsz,rss,comm
+ PID TID CLS STAT VSZ RSS COMMAND
+32194 32194 TS Sl+ 1966964 491644 weed
+
+$ ls -1 /proc/$(pidof weed)/fd | wc -l
+192
+
+$ free -m
+ total used free shared buff/cache available
+Mem: 3944 534 324 39 3086 3423
+Swap: 4094 27 4067
+```
+
+### Note on restart
+
+When stopping (CTRL-C) and restarting `weed`, it will take about 10 seconds to
+get the S3 API server back up, but another minute or two until seaweedfs has
+inspected all existing volumes and indices.
+
+In that gap, requests to S3 will look like internal server errors.
+
+```
+$ aws --endpoint-url http://localhost:8333 s3 cp s3://test/k100 -
+download failed: s3://test/k100 to - An error occurred (500) when calling the
+GetObject operation (reached max retries: 4): Internal Server Error
+```
+
+### Read benchmark
+
+Reading via command line `aws` client is a bit slow at first sight (3-5s).
+
+```
+$ time aws --endpoint-url http://localhost:8333 s3 cp s3://test/k123456789 -
+ppbhjgzkrrgwagmjsuwhqcwqzmefybeopqz [...]
+
+real 0m5.839s
+user 0m0.898s
+sys 0m0.293s
+```
+
+#### Single process random reads
+
+* via [s3read.go](https://gist.github.com/miku/6f3fee974ba82083325c2f24c912b47b#file-s3read-go)
+
+Running 1000 random reads takes 49s.
+
+#### Concurrent random reads
+
+* 80000 requests with 8 parallel processes: 7m41.973968488s, so about 170 objects/s
+* seen up to 760 keys/s reads for 8 workers
+* weed will utilize all cores, so more cpus could result in higher read throughput
+* RAM usage can increase (seen up to 20% of 4G RAM), then decrease (GC) back to 5%, depending on query load
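+
+For reference, an equivalent random-read loop in Python, just to illustrate the
+access pattern (the numbers above were measured with the Go client linked
+earlier):
+
+```
+import random
+import time
+import boto3
+
+s3 = boto3.resource(
+    "s3",
+    endpoint_url="http://localhost:8333",
+    aws_access_key_id="any",
+    aws_secret_access_key="any",
+)
+
+def random_reads(bucket="test", max_key=200000000, n=1000):
+    t0 = time.time()
+    for _ in range(n):
+        key = "k{}".format(random.randrange(max_key))
+        s3.Object(bucket, key).get()["Body"].read()
+    print("{} reads in {:.1f}s".format(n, time.time() - t0))
+```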
diff --git a/proposals/2021-04-22_crossref_db.md b/proposals/2021-04-22_crossref_db.md
new file mode 100644
index 0000000..1d4c3f8
--- /dev/null
+++ b/proposals/2021-04-22_crossref_db.md
@@ -0,0 +1,86 @@
+
+status: deployed
+
+Crossref DOI Metadata in Sandcrawler DB
+=======================================
+
+Proposal is to have a local copy of Crossref API metadata records in
+sandcrawler DB, accessible by simple key lookup via postgrest.
+
+Initial goal is to include these in scholar work "bundles" (along with
+fulltext, etc), in particular as part of reference extraction pipeline. Around
+late 2020, many additional references became available via Crossref records,
+and have not been imported (updated) into fatcat. Reference storage in fatcat
+API is a scaling problem we would like to put off, so injecting content in this
+way is desirable.
+
+To start, working with a bulk dump made available by Crossref. In the future,
+might persist the daily feed so that we have a continuously up-to-date copy.
+
+Another application of Crossref-in-bundles is to identify overall scale of
+changes since initial Crossref metadata import.
+
+
+## Sandcrawler DB Schema
+
+The "updated" field in this case refers to the upstream timestamp, not the
+sandcrawler database update time.
+
+ CREATE TABLE IF NOT EXISTS crossref (
+ doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
+ indexed TIMESTAMP WITH TIME ZONE NOT NULL,
+ record JSON NOT NULL,
+ PRIMARY KEY(doi)
+ );
+
+For postgrest access, may need to also:
+
+ GRANT SELECT ON public.crossref TO web_anon;
+
+## SQL Backfill Command
+
+For an example file:
+
+ cat sample.json \
+ | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
+ | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"
+
+For a full snapshot:
+
+ zcat crossref_public_data_file_2021_01.json.gz \
+ | pv -l \
+ | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
+ | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"
+
+jq is the bottleneck (100% of a single CPU core).
+
+## Kafka Worker
+
+Pulls from the fatcat crossref ingest Kafka feed and persists into the crossref
+table.
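+
+The write itself is an upsert keyed on DOI; a minimal sketch (assuming
+psycopg2; Kafka consumption and batching are elided):
+
+    import json
+
+    UPSERT_SQL = """
+        INSERT INTO crossref (doi, indexed, record)
+        VALUES (%s, %s, %s)
+        ON CONFLICT (doi) DO UPDATE
+        SET indexed = EXCLUDED.indexed, record = EXCLUDED.record
+        WHERE EXCLUDED.indexed > crossref.indexed
+    """
+
+    def persist_crossref_record(conn, record: dict) -> None:
+        """Insert or update a single Crossref API record (sketch)."""
+        doi = record["DOI"].lower()
+        indexed = record["indexed"]["date-time"]
+        with conn.cursor() as cur:
+            cur.execute(UPSERT_SQL, (doi, indexed, json.dumps(record)))
+        conn.commit()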
+
+## SQL Table Disk Utilization
+
+An example backfill from early 2021, with about 120 million Crossref DOI
+records.
+
+Starting database size (with ingest running):
+
+ Filesystem Size Used Avail Use% Mounted on
+ /dev/vdb1 1.7T 896G 818G 53% /1
+
+ Size: 475.14G
+
+Ingest SQL command took:
+
+ 120M 15:06:08 [2.22k/s]
+ COPY 120684688
+
+After database size:
+
+ Filesystem Size Used Avail Use% Mounted on
+ /dev/vdb1 1.7T 1.2T 498G 71% /1
+
+ Size: 794.88G
+
+So about 320 GByte of disk.
diff --git a/proposals/2021-09-09_component_ingest.md b/proposals/2021-09-09_component_ingest.md
new file mode 100644
index 0000000..09dee4f
--- /dev/null
+++ b/proposals/2021-09-09_component_ingest.md
@@ -0,0 +1,114 @@
+
+File Ingest Mode: 'component'
+=============================
+
+A new ingest type for downloading individual files which are a subset of a
+complete work.
+
+Some publishers now assign DOIs to individual figures, supplements, and other
+"components" of an overall release or document.
+
+Initial mimetypes to allow:
+
+- image/jpeg
+- image/tiff
+- image/png
+- image/gif
+- audio/mpeg
+- video/mp4
+- video/mpeg
+- text/plain
+- text/csv
+- application/json
+- application/xml
+- application/pdf
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-rar
+- application/x-7z-compressed
+- application/x-tar
+- application/vnd.ms-powerpoint
+- application/vnd.ms-excel
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+- application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+
+Intentionally not supporting:
+
+- text/html
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'component' ingest.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'component' crawls.
+
+
+## Examples
+
+Hundreds of thousands: <https://fatcat.wiki/release/search?q=type%3Acomponent+in_ia%3Afalse>
+
+#### ACS Supplement File
+
+<https://doi.org/10.1021/acscatal.0c02627.s002>
+
+Redirects directly to .zip in browser. SPN is blocked by cookie check.
+
+#### Frontiers .docx Supplement
+
+<https://doi.org/10.3389/fpls.2019.01642.s001>
+
+Redirects to full article page. There is a pop-up for figshare, seems hard to process.
+
+#### Figshare Single File
+
+<https://doi.org/10.6084/m9.figshare.13646972.v1>
+
+As 'component' type in fatcat.
+
+Redirects to a landing page. Dataset ingest seems more appropriate for this entire domain.
+
+#### PeerJ supplement file
+
+<https://doi.org/10.7717/peerj.10257/supp-7>
+
+PeerJ is hard because it redirects to a single HTML page, which has links to
+supplements in the HTML. Perhaps a custom extractor will work.
+
+#### eLife
+
+<https://doi.org/10.7554/elife.38407.010>
+
+The current crawl mechanism makes it seemingly impossible to extract a specific
+supplement from the document as a whole.
+
+#### Zookeys
+
+<https://doi.org/10.3897/zookeys.895.38576.figure53>
+
+These are extract-able.
+
+#### OECD PDF Supplement
+
+<https://doi.org/10.1787/f08c6324-en>
+<https://www.oecd-ilibrary.org/trade/imports-of-services-billions-of-us-dollars_f08c6324-en>
+
+Has an Excel (.xls) link, great, but then paywall.
+
+#### Direct File Link
+
+<https://doi.org/10.1787/888934207500>
+
+This one is also OECD, but is a simple direct download.
+
+#### Protein Data Base (PDB) Entry
+
+<https://doi.org/10.2210/pdb6ls2/pdb>
+
+Multiple files; dataset/fileset more appropriate for these.
diff --git a/proposals/2021-09-09_fileset_ingest.md b/proposals/2021-09-09_fileset_ingest.md
new file mode 100644
index 0000000..65c9ccf
--- /dev/null
+++ b/proposals/2021-09-09_fileset_ingest.md
@@ -0,0 +1,343 @@
+
+status: implemented
+
+Fileset Ingest Pipeline (for Datasets)
+======================================
+
+Sandcrawler currently has ingest support for individual files saved as `file`
+entities in fatcat (xml and pdf ingest types) and HTML files with
+sub-components saved as `webcapture` entities in fatcat (html ingest type).
+
+This document describes extensions to this ingest system to flexibly support
+groups of files, which may be represented in fatcat as `fileset` entities. The
+main new ingest type is `dataset`.
+
+Compared to the existing ingest process, there are two major complications with
+datasets:
+
+- the ingest process often requires more than parsing HTML files, and will be
+ specific to individual platforms and host software packages
+- the storage backend and fatcat entity type is flexible: a dataset might be
+ represented by a single file, multiple files combined in to a single .zip
+ file, or multiple separate files; the data may get archived in wayback or in
+ an archive.org item
+
+The new concepts of "strategy" and "platform" are introduced to accommodate
+these complications.
+
+
+## Ingest Strategies
+
+The ingest strategy describes the fatcat entity type that will be output; the
+storage backend used; and whether an enclosing file format is used. The
+strategy to use can not be determined until the number and size of files is
+known. It is a function of file count, total file size, and publication
+platform.
+
+Strategy names are compact strings with the format
+`{storage_backend}-{fatcat_entity}`. A `-bundled` suffix after a `fileset`
+entity type indicates that metadata about multiple files is retained, but that
+in the storage backend only a single enclosing file (eg, `.zip`) will be
+stored.
+
+The supported strategies are:
+
+- `web-file`: single file of any type, stored in wayback, represented as fatcat `file`
+- `web-fileset`: multiple files of any type, stored in wayback, represented as fatcat `fileset`
+- `web-fileset-bundled`: single bundle file, stored in wayback, represented as fatcat `fileset`
+- `archiveorg-file`: single file of any type, stored in archive.org item, represented as fatcat `file`
+- `archiveorg-fileset`: multiple files of any type, stored in archive.org item, represented as fatcat `fileset`
+- `archiveorg-fileset-bundled`: single bundle file, stored in archive.org item, represented as fatcat `fileset`
+
+"Bundle" or "enclosing" files are things like .zip or .tar.gz. Not all .zip
+files are handled as bundles! Only when the transfer from the hosting platform
+is via a "download all as .zip" (or similar) do we consider a zipfile a
+"bundle" and index the interior files as a fileset.
+
+The term "bundle file" is used over "archive file" or "container file" to
+prevent confusion with the other use of those terms in the context of fatcat
+(container entities; archive; Internet Archive as an organization).
+
+The motivation for supporting both `web` and `archiveorg` is that `web` is
+somewhat simpler for small files, but `archiveorg` is better for larger groups
+of files (say more than 20) and larger total size (say more than 1 GByte total,
+or 128 MByte for any one file).
+
+The motivation for supporting "bundled" filesets is that there is only a single
+file to archive.
+
+
+## Ingest Pseudocode
+
+1. Determine `platform`, which may involve resolving redirects and crawling a landing page.
+
+ a. currently we always crawl the ingest `base_url`, capturing a platform landing page
+ b. we don't currently handle the case of `base_url` leading to a non-HTML
+ terminal resource. the `component` ingest type does handle this
+
+2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`.
+
+    a. depending on platform, may include access URLs for multiple strategies
+    (eg, a URL for each file and a bundle URL), metadata about the item (eg,
+    for archive.org item upload), etc
+
+3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata.
+
+4. Summarize status and return structured result metadata.
+
+ a. if the strategy was `web-file` or `archiveorg-file`, potentially submit an
+ `ingest_file_result` object down the file ingest pipeline (Kafka topic and
+ later persist and fatcat import workers), with `dataset-file` ingest
+ type (or `{ingest_type}-file` more generally).
+
+New python types:
+
+ FilesetManifestFile
+ path: str
+ size: Optional[int]
+ md5: Optional[str]
+ sha1: Optional[str]
+ sha256: Optional[str]
+ mimetype: Optional[str]
+ extra: Optional[Dict[str, Any]]
+
+ status: Optional[str]
+ platform_url: Optional[str]
+ terminal_url: Optional[str]
+ terminal_dt: Optional[str]
+
+ FilesetPlatformItem
+ platform_name: str
+ platform_status: str
+ platform_domain: Optional[str]
+ platform_id: Optional[str]
+ manifest: Optional[List[FilesetManifestFile]]
+ archiveorg_item_name: Optional[str]
+ archiveorg_item_meta
+ web_base_url
+ web_bundle_url
+
+ ArchiveStrategyResult
+ ingest_strategy: str
+ status: str
+ manifest: List[FilesetManifestFile]
+ file_file_meta: Optional[dict]
+ file_terminal: Optional[dict]
+ file_cdx: Optional[dict]
+ bundle_file_meta: Optional[dict]
+ bundle_terminal: Optional[dict]
+ bundle_cdx: Optional[dict]
+ bundle_archiveorg_path: Optional[dict]
+
+New python APIs/classes:
+
+ FilesetPlatformHelper
+ match_request(request, resource, html_biblio) -> bool
+ does the request and landing page metadata indicate a match for this platform?
+ process_request(request, resource, html_biblio) -> FilesetPlatformItem
+ do API requests, parsing, etc to fetch metadata and access URLs for this fileset/dataset. platform-specific
+ chose_strategy(item: FilesetPlatformItem) -> IngestStrategy
+ select an archive strategy for the given fileset/dataset
+
+ FilesetIngestStrategy
+ check_existing(item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]
+ check the given backend for an existing capture/archive; if found, return result
+ process(item: FilesetPlatformItem) -> ArchiveStrategyResult
+ perform an actual archival capture
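+
+Glue code in the ingest worker would tie these together roughly as follows (a
+control-flow sketch using the method names above; the helper registry, request
+objects, and error handling are simplified):
+
+    def process_fileset_request(request, resource, html_biblio):
+        # PLATFORM_HELPERS and INGEST_STRATEGIES are assumed registries
+        for helper in PLATFORM_HELPERS:
+            if helper.match_request(request, resource, html_biblio):
+                break
+        else:
+            return {"status": "no-platform-match"}
+
+        item = helper.process_request(request, resource, html_biblio)
+        ingest_strategy = helper.chose_strategy(item)
+        strategy = INGEST_STRATEGIES[ingest_strategy]
+
+        result = strategy.check_existing(item)
+        if result is None:
+            result = strategy.process(item)
+
+        return {
+            "status": result.status,
+            "ingest_strategy": result.ingest_strategy,
+            "manifest": result.manifest,
+            "platform_name": item.platform_name,
+        }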
+
+## Limits and Failure Modes
+
+- `too-large-size`: total size of the fileset is too large for archiving.
+ initial limit is 64 GBytes, controlled by `max_total_size` parameter.
+- `too-many-files`: number of files (and thus file-level metadata) is too
+ large. initial limit is 200, controlled by `max_file_count` parameter.
+- `platform-scope / FilesetPlatformScopeError`: for when `base_url` leads to a
+ valid platform, which could be found via API or parsing, but has the wrong
+ scope. Eg, tried to fetch a dataset, but got a DOI which represents all
+ versions of the dataset, not a specific version.
+- `platform-restricted`/`PlatformRestrictedError`: for, eg, embargoes
+- `platform-404`: got to a landing page, and seemed like in-scope, but no
+ platform record found anyways
+
+
+## New Sandcrawler Code and Worker
+
+ sandcrawler-ingest-fileset-worker@{1..6} (or up to 1..12 later)
+
+Worker consumes from ingest request topic, produces to fileset ingest results,
+and optionally produces to file ingest results.
+
+ sandcrawler-persist-ingest-fileset-worker@1
+
+Simply writes fileset ingest rows to SQL.
+
+
+## New Fatcat Worker and Code Changes
+
+ fatcat-import-ingest-fileset-worker
+
+This importer is modeled on file and web worker. Filters for `success` with
+strategy of `*-fileset*`.
+
+Existing `fatcat-import-ingest-file-worker` should be updated to allow
+`dataset` single-file imports, with largely same behavior and semantics as
+current importer (`component` mode).
+
+Existing fatcat transforms, and possibly even elasticsearch schemas, should be
+updated to include fileset status and `in_ia` flag for dataset type releases.
+
+Existing entity updates worker submits `dataset` type ingests to ingest request
+topic.
+
+
+## Ingest Result Schema
+
+Common with file results, and mostly relating to landing page HTML:
+
+ hit: bool
+ status: str
+ success
+ success-existing
+ success-file (for `web-file` or `archiveorg-file` only)
+ request: object
+ terminal: object
+ file_meta: object
+ cdx: object
+ revisit_cdx: object
+ html_biblio: object
+
+Additional fileset-specific fields:
+
+ manifest: list of objects
+ platform_name: str
+ platform_domain: str
+ platform_id: str
+ platform_base_url: str
+ ingest_strategy: str
+ archiveorg_item_name: str (optional, only for `archiveorg-*` strategies)
+ file_count: int
+ total_size: int
+ fileset_bundle (optional, only for `*-fileset-bundle` strategy)
+ file_meta
+ cdx
+ revisit_cdx
+ terminal
+ archiveorg_bundle_path
+ fileset_file (optional, only for `*-file` strategy)
+ file_meta
+ terminal
+ cdx
+ revisit_cdx
+
+If the strategy was `web-file` or `archiveorg-file` and the status is
+`success-file`, then an ingest file result will also be published to
+`sandcrawler-ENV.ingest-file-results`, using the same ingest type and fields as
+regular ingest.
+
+
+All fileset ingest results get published to ingest-fileset-result.
+
+Existing sandcrawler persist workers also subscribe to this topic and persist
+status and landing page terminal info to tables just like with file ingest.
+GROBID, HTML, and other metadata is not persisted in this path.
+
+If the ingest strategy was a single file (`*-file`), then an ingest file is
+also published to the ingest-file-result topic, with the `fileset_file`
+metadata, and ingest type `dataset-file`. This should only happen on success
+condition.
+
+
+## New SQL Tables
+
+Note that this table *complements* `ingest_file_result`, doesn't replace it.
+`ingest_file_result` could more accurately be called `ingest_result`.
+
+ CREATE TABLE IF NOT EXISTS ingest_fileset_platform (
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ hit BOOLEAN NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+
+ platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1),
+ platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1),
+ platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1),
+ ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1),
+ total_size BIGINT,
+ file_count BIGINT,
+ archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1),
+
+ archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1),
+ web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1),
+ web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14),
+
+ manifest JSONB,
+ -- list, similar to fatcat fileset manifest, plus extra:
+ -- status (str)
+ -- path (str)
+ -- size (int)
+ -- md5 (str)
+ -- sha1 (str)
+ -- sha256 (str)
+ -- mimetype (str)
+ -- extra (dict)
+ -- platform_url (str)
+ -- terminal_url (str)
+ -- terminal_dt (str)
+
+ PRIMARY KEY (ingest_type, base_url)
+ );
+ CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id);
+
+Persist worker should only insert into this table if `platform_name` is
+identified.
+
+## New Kafka Topic
+
+ sandcrawler-ENV.ingest-fileset-results 6x, no retention limit
+
+
+## Implementation Plan
+
+First implement ingest worker, including platform and strategy helpers, and
+test those as simple stdin/stdout CLI tools in sandcrawler repo to validate
+this proposal.
+
+Second implement fatcat importer and test locally and/or in QA.
+
+Lastly implement infrastructure, automation, and other "glue":
+
+- SQL schema
+- persist worker
+
+
+## Design Note: Single-File Datasets
+
+Should datasets and other groups of files which only contain a single file get
+imported as a fatcat `file` or `fileset`? This can be broken down further as
+documents (single PDF) vs other individual files.
+
+Advantages of `file`:
+
+- handles case of article PDFs being marked as dataset accidentally
+- `file` entities get de-duplicated with simple lookup (eg, on `sha1`)
+- conceptually simpler if individual files are `file` entity
+- easier to download individual files
+
+Advantages of `fileset`:
+
+- conceptually simpler if all `dataset` entities have `fileset` form factor
+- code path is simpler: one fewer strategy, and less complexity of sending
+ files down separate import path
+- metadata about platform is retained
+- would require no modification of existing fatcat file importer
+- fatcat import of archive.org of `file` is not actually implemented yet?
+
+Decision is to do individual files. Fatcat fileset import worker should reject
+single-file (and empty) manifest filesets. Fatcat file import worker should
+accept all mimetypes for `dataset-file` (similar to `component`).
+
+
+## Example Entities
+
+See `notes/dataset_examples.txt`
diff --git a/proposals/2021-09-13_src_ingest.md b/proposals/2021-09-13_src_ingest.md
new file mode 100644
index 0000000..470827a
--- /dev/null
+++ b/proposals/2021-09-13_src_ingest.md
@@ -0,0 +1,53 @@
+
+File Ingest Mode: 'src'
+=======================
+
+Ingest type for "source" of works in document form. For example, tarballs of
+LaTeX source and figures, as published on arxiv.org and Pubmed Central.
+
+For now, presumption is that this would be a single file (`file` entity in
+fatcat).
+
+Initial mimetypes to allow:
+
+- text/x-tex
+- application/xml
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-tar
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'src' ingest.
+
+Might keep ingest disabled on the fatcat side, at least initially. Eg, until
+there is some sort of "file scope", or other ways of treating 'src' tarballs
+separately from PDFs or other fulltext formats.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'src' crawls.
+
+
+## Examples
+
+ arxiv:2109.00954v1
+ fatcat:release_akzp2lgqjbcbhpoeoitsj5k5hy
+ https://arxiv.org/format/2109.00954v1
+ https://arxiv.org/e-print/2109.00954v1
+
+ arxiv:1912.03397v2
+ https://arxiv.org/format/1912.03397v2
+ https://arxiv.org/e-print/1912.03397v2
+ NOT: https://arxiv.org/pdf/1912.03397v2
+
+ pmcid:PMC3767916
+ https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/03/PMC3767916.tar.gz
+
+For PMC, will need to use one of the .csv file lists to get the digit prefixes.
diff --git a/proposals/2021-09-21_spn_accounts.md b/proposals/2021-09-21_spn_accounts.md
new file mode 100644
index 0000000..e41c162
--- /dev/null
+++ b/proposals/2021-09-21_spn_accounts.md
@@ -0,0 +1,14 @@
+
+Formalization of SPNv2 API requests from fatcat/sandcrawler
+
+Create two new system accounts, one for regular/daily ingest requests, one for
+priority requests (save-paper-now or as a flag with things like fatcat-ingest;
+"interactive"). These accounts should have @archive.org emails. Request that
+the daily one have the same rate limit as the current bnewbold@archive.org
+account; the priority queue can have less.
+
+Create new ingest kafka queues from scratch, one for priority and one for
+regular. Choose sizes carefully, probably keep 24x for the regular and do 6x or
+so (small) for priority queue.
+
+Deploy new priority workers; reconfigure/deploy broadly.
diff --git a/proposals/2021-10-28_grobid_refs.md b/proposals/2021-10-28_grobid_refs.md
new file mode 100644
index 0000000..1fc79b6
--- /dev/null
+++ b/proposals/2021-10-28_grobid_refs.md
@@ -0,0 +1,125 @@
+
+GROBID References in Sandcrawler DB
+===================================
+
+Want to start processing "unstructured" raw references coming from upstream
+metadata sources (distinct from upstream fulltext sources, like PDFs or JATS
+XML), and save the results in sandcrawler DB. From there, they will get pulled
+in to fatcat-scholar "intermediate bundles" and included in reference exports.
+
+The initial use case for this is to parse "unstructured" references deposited
+in Crossref, and include them in refcat.
+
+
+## Schema and Semantics
+
+The output JSON/dict schema for parsed references follows that of
+`grobid_tei_xml` version 0.1.x, for the `GrobidBiblio` field. The
+`unstructured` field that was parsed is included in the output, though it may
+not be byte-for-byte exact (see below). One notable change from the past (eg,
+older GROBID-parsed references) is that author `name` is now `full_name`. New
+fields include `editors` (same schema as `authors`), `book_title`, and
+`series_title`.
+
+The overall output schema matches that of the `grobid_refs` SQL table:
+
+ source: string, lower-case. eg 'crossref'
+ source_id: string, eg '10.1145/3366650.3366668'
+    source_ts: optional timestamp (full ISO datetime with timezone, eg `Z`
+        suffix), which identifies the version of the upstream metadata
+ refs_json: JSON, list of `GrobidBiblio` JSON objects
+
+References are re-processed on a per-article (or per-release) basis. All the
+references for an article are handled as a batch and output as a batch. If
+there are no upstream references, a row with `refs_json` as an empty list may
+be returned.
+
+Not all upstream references get re-parsed, even if an 'unstructured' field is
+available. If 'unstructured' is not available, no row is ever output. For
+example, if a reference includes `unstructured` (raw citation string), but also
+has structured metadata for authors, title, year, and journal name, we might
+not re-parse the `unstructured` string. Whether to re-parse is evaluated on a
+per-reference basis. This behavior may change over time.
+
+`unstructured` strings may be pre-processed before being submitted to GROBID.
+This is because many sources have systemic encoding issues. GROBID itself may
+also do some modification of the input citation string before returning it in
+the output. This means the `unstructured` string is not a reliable way to map
+between specific upstream references and parsed references. Instead, the `id`
+field (str) of `GrobidBiblio` gets set to any upstream "key" or "index"
+identifier used to track individual references. If there is only a numeric
+index, the `id` is that number as a string.
+
+The `key` or `id` may need to be woven back into the ref objects manually,
+because GROBID `processCitationList` takes just a list of raw strings, with no
+attached reference-level key or id.
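+
+A sketch of that re-keying step, assuming the upstream (eg, Crossref) reference
+objects carry a `key` field and that parsed output order matches input order:
+
+    def weave_ref_keys(upstream_refs, parsed_refs):
+        """Copy upstream per-reference keys onto parsed refs, by position (sketch)."""
+        assert len(upstream_refs) == len(parsed_refs)
+        for upstream, parsed in zip(upstream_refs, parsed_refs):
+            key = upstream.get("key")
+            if key is not None:
+                parsed["id"] = str(key)
+        return parsed_refs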
+
+
+## New SQL Table and View
+
+We may want to do re-parsing of references from sources other than `crossref`,
+so there is a generic `grobid_refs` table. But it is also common to fetch both
+the crossref metadata and any re-parsed references together, so as a convenience
+there is a PostgreSQL view (virtual table) that includes both a crossref
+metadata record and parsed citations, if available. If downstream code cares a
+lot about having the refs and record be in sync, the `source_ts` field on
+`grobid_refs` can be matched against the `indexed` column of `crossref` (or the
+`.indexed.date-time` JSON field in the record itself).
+
+Remember that DOIs should always be lower-cased before querying, inserting,
+comparing, etc.
+
+ CREATE TABLE IF NOT EXISTS grobid_refs (
+ source TEXT NOT NULL CHECK (octet_length(source) >= 1),
+ source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1),
+ source_ts TIMESTAMP WITH TIME ZONE,
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ refs_json JSON NOT NULL,
+ PRIMARY KEY(source, source_id)
+ );
+
+ CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS
+ SELECT
+ crossref.doi as doi,
+ crossref.indexed as indexed,
+ crossref.record as record,
+ grobid_refs.source_ts as source_ts,
+ grobid_refs.refs_json as refs_json
+ FROM crossref
+ LEFT JOIN grobid_refs ON
+ grobid_refs.source_id = crossref.doi
+ AND grobid_refs.source = 'crossref';
+
+Both `grobid_refs` and `crossref_with_refs` will be exposed through postgrest.
+
+
+## New Workers / Tools
+
+For simplicity, to start, a single worker will consume from
+`fatcat-prod.api-crossref`, process citations with GROBID (if necessary), and
+insert into both the `crossref` and `grobid_refs` tables.
+locally on the machine with sandcrawler-db.
+
+Another tool will support taking large chunks of Crossref JSON (as lines),
+filter them, process with GROBID, and print JSON to stdout, in the
+`grobid_refs` JSON schema.
+
+
+## Task Examples
+
+Command to process crossref records with refs tool:
+
+ cat crossref_sample.json \
+ | parallel -j5 --linebuffer --round-robin --pipe ./grobid_tool.py parse-crossref-refs - \
+ | pv -l \
+ > crossref_sample.parsed.json
+
+ # => 10.0k 0:00:27 [ 368 /s]
+
+Load directly in to postgres (after tables have been created):
+
+ cat crossref_sample.parsed.json \
+ | jq -rc '[.source, .source_id, .source_ts, (.refs_json | tostring)] | @tsv' \
+ | psql sandcrawler -c "COPY grobid_refs (source, source_id, source_ts, refs_json) FROM STDIN (DELIMITER E'\t');"
+
+ # => COPY 9999
diff --git a/proposals/2021-12-09_trawling.md b/proposals/2021-12-09_trawling.md
new file mode 100644
index 0000000..33b6b4c
--- /dev/null
+++ b/proposals/2021-12-09_trawling.md
@@ -0,0 +1,180 @@
+
+status: work-in-progress
+
+NOTE: as of December 2022, the implementation of these features has not been
+merged to the main branch. Development stalled in December 2021.
+
+Trawling for Unstructured Scholarly Web Content
+===============================================
+
+## Background and Motivation
+
+A long-term goal for sandcrawler has been the ability to pick through
+unstructured web archive content (or even non-web collections), identify
+potential in-scope research outputs, extract metadata for those outputs, and
+merge the content in to a catalog (fatcat).
+
+This process requires integration of many existing tools (HTML and PDF
+extraction; fuzzy bibliographic metadata matching; machine learning to identify
+in-scope content; etc), as well as high-level curation, targeting, and
+evaluation by human operators. The goal is to augment and improve the
+productivity of human operators as much as possible.
+
+This process will be similar to "ingest", which is where we start with a
+specific URL and have some additional context about the expected result (eg,
+content type, external identifier). Some differences with trawling are that we
+start with a collection or context (instead of a single URL); have little or
+no context about the content we are looking for; and may even be creating a new
+catalog entry, as opposed to matching to a known existing entry.
+
+
+## Architecture
+
+The core operation is to take a resource and run a flowchart of processing
+steps on it, resulting in an overall status and possible related metadata. The
+common case is that the resource is a PDF or HTML coming from wayback (with
+contextual metadata about the capture), but we should be flexible to supporting
+more content types in the future, and should try to support plain files with no
+context as well.
+
+Some relatively simple wrapper code handles fetching resources and summarizing
+status/counts.
+
+Outside of the scope of sandcrawler, new fatcat code (importer or similar) will
+be needed to handle trawl results. It will probably make sense to pre-filter
+(with `jq` or `rg`) before passing results to fatcat.
+
+At this stage, trawl workers will probably be run manually. Some successful
+outputs (like GROBID, HTML metadata) would be written to existing kafka topics
+to be persisted, but there would not be any specific `trawl` SQL tables or
+automation.
+
+It will probably be helpful to have some kind of wrapper script that can run
+sandcrawler trawl processes, then filter and pipe the output into fatcat
+importer, all from a single invocation, while reporting results.
+
+TODO:
+- for HTML imports, do we fetch the full webcapture stuff and return that?
+
+
+## Methods of Operation
+
+### `cdx_file`
+
+An existing CDX file is provided on-disk locally.
+
+### `cdx_api`
+
+Simplified variants: `cdx_domain`, `cdx_surt`
+
+Uses CDX API to download records matching the configured filters, then processes the file.
+
+Saves the CDX file intermediate result somewhere locally (working or tmp
+directory), with timestamp in the path, to make re-trying with `cdx_file` fast
+and easy.
+
+
+### `archiveorg_web_collection`
+
+Uses `cdx_collection.py` (or similar) to fetch a full CDX list by iterating
+over the collection, then processes it.
+
+Saves the CDX file intermediate result somewhere locally (working or tmp
+directory), with timestamp in the path, to make re-trying with `cdx_file` fast
+and easy.
+
+### Others
+
+- `archiveorg_file_collection`: fetch file list via archive.org metadata, then processes each
+
+## Schema
+
+Per-resource results:
+
+ hit (bool)
+ indicates whether resource seems in scope and was processed successfully
+        (roughly, status 'success')
+    status (str)
+        success: fetched resource, ran processing
+ skip-cdx: filtered before even fetching resource
+ skip-resource: filtered after fetching resource
+ wayback-error (etc): problem fetching
+ content_scope (str)
+ filtered-{filtertype}
+ article (etc)
+ landing-page
+ resource_type (str)
+ pdf, html
+ file_meta{}
+ cdx{}
+ revisit_cdx{}
+
+ # below are resource_type specific
+ grobid
+ pdf_meta
+ pdf_trio
+ html_biblio
+ (other heuristics and ML)
+
+High-level request:
+
+ trawl_method: str
+ cdx_file_path
+ default_filters: bool
+ resource_filters[]
+ scope: str
+ surt_prefix, domain, host, mimetype, size, datetime, resource_type, http_status
+ value: any
+ values[]: any
+ min: any
+ max: any
+ biblio_context{}: set of expected/default values
+ container_id
+ release_type
+ release_stage
+ url_rel
+
+High-level summary / results:
+
+ status
+ request{}: the entire request object
+ counts
+ total_resources
+ status{}
+ content_scope{}
+ resource_type{}
+
+## Example Corpuses
+
+All PDFs (`application/pdf`) in web.archive.org from before the year 2000.
+Starting point would be a CDX list.
+
+Spidering crawls starting from a set of OA journal homepage URLs.
+
+Archive-It partner collections from research universities, particularly of
+their own .edu domains. Starting point would be an archive.org collection, from
+which WARC files or CDX lists can be accessed.
+
+General archive.org PDF collections, such as
+[ERIC](https://archive.org/details/ericarchive) or
+[Document Cloud](https://archive.org/details/documentcloud).
+
+Specific Journal or Publisher URL patterns. Starting point could be a domain,
+hostname, SURT prefix, and/or URL regex.
+
+Heuristic patterns over full web.archive.org CDX index. For example, .edu
+domains with user directories and a `.pdf` in the file path ("tilde" username
+pattern).
+
+Random samples of entire Wayback corpus. For example, random samples filtered
+by date, content type, TLD, etc. This would be true "trawling" over the entire
+corpus.
+
+
+## Other Ideas
+
+Could have a web archive spidering mode: starting from a seed, fetch multiple
+captures (different captures), then extract outlinks from those, up to some
+number of hops. An example application would be links to research group
+webpages or author homepages, and to try to extract PDF links from CVs, etc.
+
diff --git a/proposals/brainstorm/2021-debug_web_interface.md b/proposals/brainstorm/2021-debug_web_interface.md
new file mode 100644
index 0000000..442b439
--- /dev/null
+++ b/proposals/brainstorm/2021-debug_web_interface.md
@@ -0,0 +1,9 @@
+
+status: brainstorm idea
+
+Simple internal-only web interface to help debug ingest issues.
+
+- paste a hash, URL, or identifier and get a display of "everything we know" about it
+- enter a URL/SURT prefix and get aggregate stats (?)
+- enter a domain/host/prefix and get recent attempts/results
+- pre-computed periodic reports on ingest pipeline (?)
diff --git a/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md b/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md
new file mode 100644
index 0000000..b3ad447
--- /dev/null
+++ b/proposals/brainstorm/2022-04-18_automated_heritrix_crawling.md
@@ -0,0 +1,36 @@
+
+status: brainstorming
+
+We continue to see issues with SPNv2-based crawling. Would like to have an
+option to switch to higher-throughput heritrix3-based crawling.
+
+SPNv2 path would stick around at least for save-paper-now style ingest.
+
+
+## Sketch
+
+Ingest requests are created continuously by fatcat, with daily spikes.
+
+Ingest workers run mostly in "bulk" mode, i.e. they don't make SPNv2 calls.
+`no-capture` responses are recorded in the sandcrawler SQL database.
+
+Periodically (daily?), a script queries for new no-capture results, filtered to
+the most recent period. These are processed a bit into a URL list, then
+converted to a heritrix frontier, and sent to crawlers. This could either be via
+an h3 instance (?), or a simple `scp` to a running crawl directory.
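+
+A rough sketch of that periodic dump step (table and column names are
+assumptions, following the existing ingest result tables):
+
+    import datetime
+
+    import psycopg2
+
+    def dump_recent_nocapture(out_path: str, days: int = 1) -> None:
+        # dump recent 'no-capture' URLs as a plain list, one per line, for a heritrix frontier
+        since = datetime.datetime.utcnow() - datetime.timedelta(days=days)
+        db = psycopg2.connect("dbname=sandcrawler")
+        cur = db.cursor()
+        cur.execute(
+            "SELECT DISTINCT base_url FROM ingest_file_result WHERE status = 'no-capture' AND updated >= %s",
+            (since,),
+        )
+        with open(out_path, "w") as f:
+            for (url,) in cur:
+                f.write(url + "\n")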
+
+The crawler crawls, with usual landing page config, and draintasker runs.
+
+TODO: can we have draintasker/heritrix set a maximum WARC lifetime? Like 6 hours?
+Or target a smaller draintasker item size, so items get updated more frequently.
+
+Another SQL script dumps ingest requests from the *previous* period, and
+re-submits them for bulk-style ingest (by workers).
+
+The end result would be things getting crawled and updated within a couple
+days.
+
+
+## Sketch 2
+
+Upload URL list to petabox item, wait for heritrix derive to run (!)
diff --git a/proposals/schema_changes.sql b/proposals/schema_changes.sql
new file mode 100644
index 0000000..e18d051
--- /dev/null
+++ b/proposals/schema_changes.sql
@@ -0,0 +1,40 @@
+
+-- file_meta: more NOT NULL
+CREATE TABLE IF NOT EXISTS file_meta (
+ sha1hex TEXT NOT NULL PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ sha256hex TEXT NOT NULL CHECK (octet_length(sha256hex) = 64),
+ md5hex TEXT NOT NULL CHECK (octet_length(md5hex) = 32),
+ size_bytes BIGINT NOT NULL,
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1)
+);
+
+-- CDX: add domain/host columns?
+CREATE TABLE IF NOT EXISTS cdx (
+ url TEXT NOT NULL CHECK (octet_length(url) >= 1),
+ datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1),
+ warc_path TEXT CHECK (octet_length(warc_path) >= 1),
+ warc_csize BIGINT,
+ warc_offset BIGINT,
+ row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ domain TEXT NOT NULL CHECK (octet_length(domain) >= 1),
+ host TEXT NOT NULL CHECK (octet_length(host) >= 1),
+ PRIMARY KEY(url, datetime)
+);
+CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
+CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
+
+-- direct fast import with just md5hex; big UPDATE via join with file_meta
+CREATE TABLE IF NOT EXISTS shadow (
+ shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
+ shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
+ sha1hex TEXT CHECK (octet_length(sha1hex) = 40),
+ md5hex TEXT CHECK (octet_length(md5hex) = 32),
+ doi TEXT CHECK (octet_length(doi) >= 1),
+ pmid TEXT CHECK (octet_length(pmid) >= 1),
+ isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
+ PRIMARY KEY(shadow_corpus, shadow_id)
+);
+CREATE INDEX IF NOT EXISTS shadow_sha1hex_idx ON shadow(sha1hex);
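+
+-- sketch only (not part of this migration): after a fast md5-only import, the
+-- sha1hex column could be backfilled by joining against file_meta, e.g.:
+--
+--   UPDATE shadow
+--   SET sha1hex = file_meta.sha1hex
+--   FROM file_meta
+--   WHERE shadow.sha1hex IS NULL
+--     AND shadow.md5hex = file_meta.md5hex;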
diff --git a/python/.coveragerc b/python/.coveragerc
index 6235f57..51038d6 100644
--- a/python/.coveragerc
+++ b/python/.coveragerc
@@ -1,3 +1,4 @@
[run]
omit = tests/*
-source = .
+source =
+ sandcrawler
diff --git a/python/.flake8 b/python/.flake8
new file mode 100644
index 0000000..c7ef5fe
--- /dev/null
+++ b/python/.flake8
@@ -0,0 +1,21 @@
+[flake8]
+select = C,E,F,W,ANN
+# ANN003 is annotation on, eg, **kwargs
+# ANN101 is annotation on 'self' (why would that be wanted?)
+# ANN204 is annotation on '__init__()'
+# ANN401 is 'Any' type
+# E265,E266 are restrictions on comments ('#')
+# E501 is line-too-long, which we enforce with black
+# W503,E203 are allowed by black
+# TODO: C901 is complexity, should be re-enabled at some point
+ignore = ANN003,ANN101,ANN204,ANN401,E265,E266,E501,C901,W503,E203
+per-file-ignores =
+ sandcrawler/__init__.py: F401
+ sandcrawler/ia.py: E402
+ tests/*.py: ANN201,ANN001,F403,F405
+ # TODO: add more annotations to CLI scripts
+ *_tool.py,sandcrawler_worker.py: ANN201,ANN001,ANN202,ANN206,ANN205,F403,F405
+ scripts: ANN201,ANN001,ANN202,ANN206,ANN205
+exclude = .git,__pycache__,.venv,scripts/
+max-line-length = 96
+max-complexity = 30
diff --git a/python/.gitignore b/python/.gitignore
index d53fac8..a5a773e 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -1,3 +1,14 @@
*part-000*
*.tar.gz
-*.tsv.gz
+*.gz
+htmlcov/
+samples/
+*.json
+TODO*
+*.tsv
+
+!.flake8
+!.gitlab-ci.yml
+!.pylintrc
+!.coveragerc
+!.gitignore
diff --git a/python/.pylintrc b/python/.pylintrc
index 80e203d..387bca1 100644
--- a/python/.pylintrc
+++ b/python/.pylintrc
@@ -11,4 +11,4 @@ include-ids=yes
notes=FIXME,XXX,DELETEME
[TYPECHECK]
-ignored-modules=responses
+extension-pkg-whitelist=selectolax,pydantic,responses
diff --git a/python/Makefile b/python/Makefile
new file mode 100644
index 0000000..940a7eb
--- /dev/null
+++ b/python/Makefile
@@ -0,0 +1,32 @@
+
+SHELL = /bin/bash
+.SHELLFLAGS = -o pipefail -c
+
+.PHONY: help
+help: ## Print info about all commands
+ @echo "Commands:"
+ @echo
+ @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: deps
+deps: ## Install dependencies using pipenv
+ pipenv install --dev
+
+.PHONY: lint
+lint: ## Run lints (eg, flake8, mypy)
+ pipenv run flake8 . --exit-zero
+ pipenv run isort -q -c . || true
+ pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports
+
+.PHONY: fmt
+fmt: ## Run code formatting on all source code
+ pipenv run isort --atomic .
+ pipenv run black --line-length 96 sandcrawler/ tests/ scripts/ *.py
+
+.PHONY: test
+test: ## Run all tests and lints
+ pipenv run pytest
+
+.PHONY: coverage
+coverage: ## Run all tests with coverage
+ pipenv run pytest --cov --cov-report=term --cov-report=html
diff --git a/python/Pipfile b/python/Pipfile
index eae64f3..b841755 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -1,6 +1,6 @@
[[source]]
name = "ia"
-url = "https://devpi.archive.org/wb/prod"
+url = "https://devpi.us.archive.org/wb/prod"
verify_ssl = true
[[source]]
@@ -9,36 +9,60 @@ url = "https://pypi.python.org/simple"
verify_ssl = true
[dev-packages]
-ipython = "*"
pytest = ">=4"
pytest-pythonpath = "*"
pytest-pylint = "*"
responses = ">=0.10"
pytest-cov = "*"
+pytest-mock = "*"
pylint = "*"
+ipython = "*"
+mypy = "*"
+flake8 = "*"
+flake8-annotations = "*"
+isort = "*"
+types-requests = "*"
+types-beautifulsoup4 = "*"
+types-dateparser = "*"
+types-psycopg2 = "*"
+types-Pillow = "*"
+black = "*"
[packages]
requests = ">=2"
-raven = {extras = ['flask'],version = "*"}
-pykafka = "*"
confluent-kafka = "*"
python-snappy = "*"
boto3 = "*"
-minio = "*"
+minio = "<7.0.0"
psycopg2 = "*"
bs4 = "*"
python-magic = "*"
ftfy = "*"
internetarchive = "*"
-Flask = ">=1"
+urlcanon = "*"
+Pillow = ">=3"
+python-poppler = ">=0.2.1"
+selectolax = ">=0.2"
+# constraining trafilatura to prevent a version conflict with
+# `charset_normalizer`, between htmldate and requests
+trafilatura = ">=1,<1.4"
+htmldate= ">=1,<1.4"
+pydantic = ">=1.7"
+dateparser = "*"
+braveblock = "*"
+dynaconf = ">=3"
+sentry-sdk = { version = ">=0.14.0", extras = [] }
+zstandard = "*"
+grobid_tei_xml = ">=0.1.2,<0.2.0"
+PyMuPDF = ">=1.19.0,<1.20.0"
[requires]
-python_version = "3.5"
+python_version = "3.8"
[packages.globalwayback]
-version = ">=0.3"
+version = ">=0.6.5"
index = "ia"
[packages.wayback]
-version = ">=0.2.1.2"
+version = ">=0.6.3"
index = "ia"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index ba40df5..546a420 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,16 +1,16 @@
{
"_meta": {
"hash": {
- "sha256": "9cac128d516cbeba1d0c945f674c8c5953faecac09f03ef33c6f402a21f2fbb4"
+ "sha256": "35d0f0cd2f3903cce19d5a73f50a89ba09a1b43abbda84894fd45411d7f32760"
},
"pipfile-spec": 6,
"requires": {
- "python_version": "3.5"
+ "python_version": "3.8"
},
"sources": [
{
"name": "ia",
- "url": "https://devpi.archive.org/wb/prod",
+ "url": "https://devpi.us.archive.org/wb/prod",
"verify_ssl": true
},
{
@@ -21,80 +21,164 @@
]
},
"default": {
- "args": {
- "hashes": [
- "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814"
- ],
- "version": "==0.1.0"
- },
- "backports.csv": {
- "hashes": [
- "sha256:1277dfff73130b2e106bf3dd347adb3c5f6c4340882289d88f31240da92cbd6d",
- "sha256:21f6e09bab589e6c1f877edbc40277b65e626262a86e69a70137db714eaac5ce"
- ],
- "version": "==1.0.7"
+ "async-timeout": {
+ "hashes": [
+ "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15",
+ "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==4.0.2"
+ },
+ "backports.zoneinfo": {
+ "hashes": [
+ "sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf",
+ "sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328",
+ "sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546",
+ "sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6",
+ "sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570",
+ "sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9",
+ "sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7",
+ "sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987",
+ "sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722",
+ "sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582",
+ "sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc",
+ "sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b",
+ "sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1",
+ "sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08",
+ "sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac",
+ "sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"
+ ],
+ "markers": "python_version < '3.9' and python_version >= '3.6' and python_version < '3.9'",
+ "version": "==0.2.1"
},
"beautifulsoup4": {
"hashes": [
- "sha256:05668158c7b85b791c5abde53e50265e16f98ad601c402ba44d70f96c4159612",
- "sha256:25288c9e176f354bf277c0a10aa96c782a6a18a17122dba2e8cec4a97e03343b",
- "sha256:f040590be10520f2ea4c2ae8c3dae441c7cfff5308ec9d58a0ec0c1b8f81d469"
- ],
- "version": "==4.8.0"
- },
- "blinker": {
- "hashes": [
- "sha256:471aee25f3992bd325afa3772f1063dbdbbca947a041b8b89466dc00d606f8b6"
+ "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30",
+ "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"
],
- "version": "==1.4"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.11.1"
},
"boto3": {
"hashes": [
- "sha256:482ba0750cd9772c9d3bc67fe17a9470eba582aa6cf2810d7656a8d6e82a3b05",
- "sha256:c8593cf034e678e58b361f555f66f66e7f700ac32bd4341768744b48de9f1f98"
+ "sha256:7a6766c7177a9c6f85365e02aabd96ca4d72e08bc5cb127cb51b0a97ac9b9d1b",
+ "sha256:82b790b1dabd0746b028d2013b5d4d636a41f3aaf25520081f4c173cb6eb395d"
],
"index": "ia",
- "version": "==1.9.236"
+ "version": "==1.26.37"
},
"botocore": {
"hashes": [
- "sha256:6d1f302ea190e7287eb3909e851e04676f96803c962e46b4f716dd0cea99bce5",
- "sha256:b65e83183501ee89e2ec8e8f35c30a99ce2680dd41e7bf4f7241a86f4bb9f8a9"
+ "sha256:18ab8e95345a6d0d2653ce65d261a0aef6fef8a57a35a89e3cea6ffe315e92fc",
+ "sha256:3afa4fec9f7713caa05116563b38f81bec7bd20585d517155484d3f25efab5aa"
],
- "version": "==1.12.236"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.29.37"
+ },
+ "braveblock": {
+ "hashes": [
+ "sha256:0bfca14473275366f2f822751c4e8dde7f94ee5ce8a9372244870452458f4fe1",
+ "sha256:107050b2e1c885b748727573a54a85d2e1ea9ad86146370f6eb79ca18b9673d4",
+ "sha256:13f9769eac9c4027eba2f400e635572796f7a7feb343f442d13c4b78e7d6f536",
+ "sha256:14efeada36418525da7c3b26393041b85242ffa1165328ec7eaf9b9780b72d62",
+ "sha256:1ab6980d10b8a02fd0dc73e28f18a0a3e17be636d314c1fdaa3bbb3e36a81f0f",
+ "sha256:45286418a43a3dfab50bdaf922f5003dbd2c3d1f696d23883568f4fa14b8093e",
+ "sha256:66c2442154102bff8df9c6f05cb72cd5cda6f4e1ed88592800ab1b6e8100e806",
+ "sha256:73de4f925ae5442d3361a71d7c0eeb1b4c540bf3d0c91100a00325ccef9e743c",
+ "sha256:80cbeeb6d083bc2a9106214188e5ce05362f248c1051344dc6673b7b38a561da",
+ "sha256:8460b10c9b82cc9d0b6056e1fe206bea209fe5a83ba87bdf9486305657224a44",
+ "sha256:903c506fc05eb6b76e4d31f957c1118078582db80f8ef5ce5ac74418f094d498",
+ "sha256:dcb773e3e275de896efebe57159a67587283d6ca1d1a36695170a3756fd2ef3a"
+ ],
+ "index": "ia",
+ "version": "==0.3.0"
},
"brotli": {
"hashes": [
- "sha256:0538dc1744fd17c314d2adc409ea7d1b779783b89fd95bcfb0c2acc93a6ea5a7",
- "sha256:0970a47f471782912d7705160b2b0a9306e68e6fadf9cffcaeb42d8f0951e26c",
- "sha256:113f51658e6fe548dce4b3749f6ef6c24de4184ba9c10a909cbee4261c2a5da0",
- "sha256:1e1aa9c4d1558889f42749c8baf846007953bfd32c8209230cf1cd1f5ef33495",
- "sha256:2f2f4f78f29ac4a45d15b3d9fc3fd9705e0ad313a44b129f6e1d0c6916bad0e2",
- "sha256:3269f6de1dd150fd0cce1c158b61ff5ac06d627fd3ae9c6ea03aed26fbbff7ea",
- "sha256:50dd9ad2a2bb12da4e9002a438672d182f98e546e99952de80280a1e1729664f",
- "sha256:5519a4b01b1a4f965083cbfa2ef2b9774c5a5f352341c47b50776ad109423d72",
- "sha256:5eb27722d320370315971c427eb8aa7cc0791f2a458840d357ac653bd0ad3a14",
- "sha256:5f06b4d5b6f58e5b5c220c2f23cad034dc5efa51b01fde2351ced1605bd980e2",
- "sha256:72848d25a5f9e736db4af4512e0c3feecc094d57d241f8f1ae959115a2c39756",
- "sha256:743001bca75f4a6b4454be3510feca46f9d61a0c782a9bc2bc684bdb245e279e",
- "sha256:9d1c2dd27a1083fefd05b1b2f8df4a6bc2aaa6c21dd82cd41c8ae5e7c23a87f8",
- "sha256:a13ce9b419fe9f277c63f700efb0e444331509d1881b5610d2ba7e9080606967",
- "sha256:a19ef0952b9d2803df88dff07f45a6c92d5676afb9b8d69cf32232d684036d11",
- "sha256:ad766ca8b8c1419b71a22756b45264f45725c86133dc80a7cbe30b6b78c75620",
- "sha256:ad7963f261988ee0883816b6b9f206f11461c9b3cb5cfbca0c9ab5adc406d395",
- "sha256:c16201060c5a3f8742e3deae759014251ac92f382f82bc2a41dc079ff18c3f24",
- "sha256:c43b202f65891861a9a336984a103de25de235f756de69e32db893156f767013",
- "sha256:c675c6cce4295cb1a692f3de7416aacace7314e064b94bc86e93aceefce7fd3e",
- "sha256:d17cec0b992b1434f5f9df9986563605a4d1b1acd5574c87fc2ac014bcbd3316",
- "sha256:dc91f6129953861a73d9a65c52a8dd682b561a9ebaf65283541645cab6489917",
- "sha256:e2f4cbd1760d2bf2f30e396c2301999aab0191aec031a6a8a04950b2f575a536",
- "sha256:f192e6d3556714105c10486bbd6d045e38a0c04d9da3cef21e0a8dfd8e162df4",
- "sha256:f775b07026af2b1b0b5a8b05e41571cdcf3a315a67df265d60af301656a5425b",
- "sha256:f969ec7f56ba9636679e69ca07fba548312ccaca37412ee823c7f413541ad7e0",
- "sha256:f9dc52cd70907aafb99a773b66b156f2f995c7a0d284397c487c8b71ddbef2f9",
- "sha256:fc7212e36ebeb81aebf7949c92897b622490d7c0e333a479c0395591e7994600"
- ],
- "version": "==1.0.7"
+ "sha256:02177603aaca36e1fd21b091cb742bb3b305a569e2402f1ca38af471777fb019",
+ "sha256:11d3283d89af7033236fa4e73ec2cbe743d4f6a81d41bd234f24bf63dde979df",
+ "sha256:12effe280b8ebfd389022aa65114e30407540ccb89b177d3fbc9a4f177c4bd5d",
+ "sha256:160c78292e98d21e73a4cc7f76a234390e516afcd982fa17e1422f7c6a9ce9c8",
+ "sha256:16d528a45c2e1909c2798f27f7bf0a3feec1dc9e50948e738b961618e38b6a7b",
+ "sha256:19598ecddd8a212aedb1ffa15763dd52a388518c4550e615aed88dc3753c0f0c",
+ "sha256:1c48472a6ba3b113452355b9af0a60da5c2ae60477f8feda8346f8fd48e3e87c",
+ "sha256:268fe94547ba25b58ebc724680609c8ee3e5a843202e9a381f6f9c5e8bdb5c70",
+ "sha256:269a5743a393c65db46a7bb982644c67ecba4b8d91b392403ad8a861ba6f495f",
+ "sha256:26d168aac4aaec9a4394221240e8a5436b5634adc3cd1cdf637f6645cecbf181",
+ "sha256:29d1d350178e5225397e28ea1b7aca3648fcbab546d20e7475805437bfb0a130",
+ "sha256:2aad0e0baa04517741c9bb5b07586c642302e5fb3e75319cb62087bd0995ab19",
+ "sha256:3148362937217b7072cf80a2dcc007f09bb5ecb96dae4617316638194113d5be",
+ "sha256:330e3f10cd01da535c70d09c4283ba2df5fb78e915bea0a28becad6e2ac010be",
+ "sha256:336b40348269f9b91268378de5ff44dc6fbaa2268194f85177b53463d313842a",
+ "sha256:3496fc835370da351d37cada4cf744039616a6db7d13c430035e901443a34daa",
+ "sha256:35a3edbe18e876e596553c4007a087f8bcfd538f19bc116917b3c7522fca0429",
+ "sha256:3b78a24b5fd13c03ee2b7b86290ed20efdc95da75a3557cc06811764d5ad1126",
+ "sha256:3b8b09a16a1950b9ef495a0f8b9d0a87599a9d1f179e2d4ac014b2ec831f87e7",
+ "sha256:3c1306004d49b84bd0c4f90457c6f57ad109f5cc6067a9664e12b7b79a9948ad",
+ "sha256:3ffaadcaeafe9d30a7e4e1e97ad727e4f5610b9fa2f7551998471e3736738679",
+ "sha256:40d15c79f42e0a2c72892bf407979febd9cf91f36f495ffb333d1d04cebb34e4",
+ "sha256:44bb8ff420c1d19d91d79d8c3574b8954288bdff0273bf788954064d260d7ab0",
+ "sha256:4688c1e42968ba52e57d8670ad2306fe92e0169c6f3af0089be75bbac0c64a3b",
+ "sha256:495ba7e49c2db22b046a53b469bbecea802efce200dffb69b93dd47397edc9b6",
+ "sha256:4d1b810aa0ed773f81dceda2cc7b403d01057458730e309856356d4ef4188438",
+ "sha256:503fa6af7da9f4b5780bb7e4cbe0c639b010f12be85d02c99452825dd0feef3f",
+ "sha256:56d027eace784738457437df7331965473f2c0da2c70e1a1f6fdbae5402e0389",
+ "sha256:5913a1177fc36e30fcf6dc868ce23b0453952c78c04c266d3149b3d39e1410d6",
+ "sha256:5b6ef7d9f9c38292df3690fe3e302b5b530999fa90014853dcd0d6902fb59f26",
+ "sha256:5bf37a08493232fbb0f8229f1824b366c2fc1d02d64e7e918af40acd15f3e337",
+ "sha256:5cb1e18167792d7d21e21365d7650b72d5081ed476123ff7b8cac7f45189c0c7",
+ "sha256:61a7ee1f13ab913897dac7da44a73c6d44d48a4adff42a5701e3239791c96e14",
+ "sha256:622a231b08899c864eb87e85f81c75e7b9ce05b001e59bbfbf43d4a71f5f32b2",
+ "sha256:68715970f16b6e92c574c30747c95cf8cf62804569647386ff032195dc89a430",
+ "sha256:6b2ae9f5f67f89aade1fab0f7fd8f2832501311c363a21579d02defa844d9296",
+ "sha256:6c772d6c0a79ac0f414a9f8947cc407e119b8598de7621f39cacadae3cf57d12",
+ "sha256:6d847b14f7ea89f6ad3c9e3901d1bc4835f6b390a9c71df999b0162d9bb1e20f",
+ "sha256:73fd30d4ce0ea48010564ccee1a26bfe39323fde05cb34b5863455629db61dc7",
+ "sha256:76ffebb907bec09ff511bb3acc077695e2c32bc2142819491579a695f77ffd4d",
+ "sha256:7bbff90b63328013e1e8cb50650ae0b9bac54ffb4be6104378490193cd60f85a",
+ "sha256:7cb81373984cc0e4682f31bc3d6be9026006d96eecd07ea49aafb06897746452",
+ "sha256:7ee83d3e3a024a9618e5be64648d6d11c37047ac48adff25f12fa4226cf23d1c",
+ "sha256:854c33dad5ba0fbd6ab69185fec8dab89e13cda6b7d191ba111987df74f38761",
+ "sha256:85f7912459c67eaab2fb854ed2bc1cc25772b300545fe7ed2dc03954da638649",
+ "sha256:87fdccbb6bb589095f413b1e05734ba492c962b4a45a13ff3408fa44ffe6479b",
+ "sha256:88c63a1b55f352b02c6ffd24b15ead9fc0e8bf781dbe070213039324922a2eea",
+ "sha256:8a674ac10e0a87b683f4fa2b6fa41090edfd686a6524bd8dedbd6138b309175c",
+ "sha256:8ed6a5b3d23ecc00ea02e1ed8e0ff9a08f4fc87a1f58a2530e71c0f48adf882f",
+ "sha256:93130612b837103e15ac3f9cbacb4613f9e348b58b3aad53721d92e57f96d46a",
+ "sha256:9744a863b489c79a73aba014df554b0e7a0fc44ef3f8a0ef2a52919c7d155031",
+ "sha256:9749a124280a0ada4187a6cfd1ffd35c350fb3af79c706589d98e088c5044267",
+ "sha256:97f715cf371b16ac88b8c19da00029804e20e25f30d80203417255d239f228b5",
+ "sha256:9bf919756d25e4114ace16a8ce91eb340eb57a08e2c6950c3cebcbe3dff2a5e7",
+ "sha256:9d12cf2851759b8de8ca5fde36a59c08210a97ffca0eb94c532ce7b17c6a3d1d",
+ "sha256:9ed4c92a0665002ff8ea852353aeb60d9141eb04109e88928026d3c8a9e5433c",
+ "sha256:a72661af47119a80d82fa583b554095308d6a4c356b2a554fdc2799bc19f2a43",
+ "sha256:afde17ae04d90fbe53afb628f7f2d4ca022797aa093e809de5c3cf276f61bbfa",
+ "sha256:b1375b5d17d6145c798661b67e4ae9d5496920d9265e2f00f1c2c0b5ae91fbde",
+ "sha256:b336c5e9cf03c7be40c47b5fd694c43c9f1358a80ba384a21969e0b4e66a9b17",
+ "sha256:b3523f51818e8f16599613edddb1ff924eeb4b53ab7e7197f85cbc321cdca32f",
+ "sha256:b43775532a5904bc938f9c15b77c613cb6ad6fb30990f3b0afaea82797a402d8",
+ "sha256:b663f1e02de5d0573610756398e44c130add0eb9a3fc912a09665332942a2efb",
+ "sha256:b83bb06a0192cccf1eb8d0a28672a1b79c74c3a8a5f2619625aeb6f28b3a82bb",
+ "sha256:ba72d37e2a924717990f4d7482e8ac88e2ef43fb95491eb6e0d124d77d2a150d",
+ "sha256:c2415d9d082152460f2bd4e382a1e85aed233abc92db5a3880da2257dc7daf7b",
+ "sha256:c83aa123d56f2e060644427a882a36b3c12db93727ad7a7b9efd7d7f3e9cc2c4",
+ "sha256:c8e521a0ce7cf690ca84b8cc2272ddaf9d8a50294fd086da67e517439614c755",
+ "sha256:cab1b5964b39607a66adbba01f1c12df2e55ac36c81ec6ed44f2fca44178bf1a",
+ "sha256:cb02ed34557afde2d2da68194d12f5719ee96cfb2eacc886352cb73e3808fc5d",
+ "sha256:cc0283a406774f465fb45ec7efb66857c09ffefbe49ec20b7882eff6d3c86d3a",
+ "sha256:cfc391f4429ee0a9370aa93d812a52e1fee0f37a81861f4fdd1f4fb28e8547c3",
+ "sha256:db844eb158a87ccab83e868a762ea8024ae27337fc7ddcbfcddd157f841fdfe7",
+ "sha256:defed7ea5f218a9f2336301e6fd379f55c655bea65ba2476346340a0ce6f74a1",
+ "sha256:e16eb9541f3dd1a3e92b89005e37b1257b157b7256df0e36bd7b33b50be73bcb",
+ "sha256:e1abbeef02962596548382e393f56e4c94acd286bd0c5afba756cffc33670e8a",
+ "sha256:e23281b9a08ec338469268f98f194658abfb13658ee98e2b7f85ee9dd06caa91",
+ "sha256:e2d9e1cbc1b25e22000328702b014227737756f4b5bf5c485ac1d8091ada078b",
+ "sha256:e48f4234f2469ed012a98f4b7874e7f7e173c167bed4934912a29e03167cf6b1",
+ "sha256:e4c4e92c14a57c9bd4cb4be678c25369bf7a092d55fd0866f759e425b9660806",
+ "sha256:ec1947eabbaf8e0531e8e899fc1d9876c179fc518989461f5d24e2223395a9e3",
+ "sha256:f909bbbc433048b499cb9db9e713b5d8d949e8c109a2a548502fb9aa8630f0b1"
+ ],
+ "version": "==1.0.9"
},
"bs4": {
"hashes": [
@@ -105,97 +189,164 @@
},
"certifi": {
"hashes": [
- "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50",
- "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2019.9.11"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
"chardet": {
"hashes": [
- "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
- "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5",
+ "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"
],
- "version": "==3.0.4"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.1.0"
},
- "click": {
+ "charset-normalizer": {
"hashes": [
- "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
- "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "version": "==7.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
- "clint": {
+ "configparser": {
"hashes": [
- "sha256:05224c32b1075563d0b16d0015faaf9da43aa214e4a2140e51f08789e7a4c5aa"
+ "sha256:8be267824b541c09b08db124917f48ab525a6c3e837011f3130781a224c57090",
+ "sha256:b065779fd93c6bf4cee42202fa4351b4bb842e96a3fb469440e484517a49b9fa"
],
- "version": "==0.5.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==5.3.0"
},
"confluent-kafka": {
"hashes": [
- "sha256:13d0146850c046b9e3dabbcb60bd7c4f02c7d4754b693266ee5bb0884faea2c7",
- "sha256:1505de9c652f9b841ba600f35e6a7f4cae2e239e9b8255212433756d17c1aeed",
- "sha256:1832373eee96b0ef246c773ec2613c382bf4577f0d42e2ce688e8d35ca373e69",
- "sha256:3001d09e5efa357eb9f3166ef54703166d7b662eb55841b760a346313ca717ee",
- "sha256:31b934aa821e4d6aff06dd260b14c03ca58a532bb160032acff573e2de0a4467",
- "sha256:359d776fb0381147e4c9981676d3907f9dbd12733597a6fec455e5a516728ae2",
- "sha256:472f63ee352a1464ff40325cc3a86aae2685716985bb2c9cad7e574f4203d664",
- "sha256:53ec15b8b76109489c77c0fe3d2f5ff71398cef2db74053c4818f6667a14471d",
- "sha256:6299420c462b274095d4624cf246dc90e49d6d3c1acd8b7222d39eb407476301",
- "sha256:8a497aee87d01891090c9c59ee760387bc41c718a358dadc926a6cae36dfa4fa",
- "sha256:8bd48e88b2d9ba42b58e8c5c1a266a4a7945757d1ae1c2bad3c0fbdf7b52c5e5",
- "sha256:92684d50215c111025ebb40c813f695fd9df69f763f04e9b25fc4fd67e5c0d06",
- "sha256:957417782592136c91c81d7161be8c6fc334f65d229a672d18c4cc85a0a09532",
- "sha256:a41eda5c84e153e0ec8d854c064ebe242a8a6993f9069f6ae83dfd5f9dda1ac6",
- "sha256:b4e5b7f5f597d4a40e0148e1fe509ab800231542ce7e8765265e8c251e6e3983",
- "sha256:c41d12f4e1deb54e1fa6e40260d95aa7b1c1ebfa4bbddb8a03f06fde3611da71",
- "sha256:c8b75e6e803e1a4346d12f212ead3871f86cf21e4d6b5028e8701f8e9c8f105b",
- "sha256:c8dee478a46a9352224fce1d12756cff6cf50e2684121a118141c7908ae9ed3e",
- "sha256:d072825c0f9dec85ebc5eaa887c324914cb3631c884b35fc3c173663d4222a3a",
- "sha256:d328d11edfe049cbdd6bf0792bcff9a9afce26c25124f198fe0ea1f910b529b3",
- "sha256:d52708474daec030399cf45a6bff55b13d631cdba6cc6546da351f675a901d44",
- "sha256:dd02a60145961957ad293aed30a435de2069024aec134575bea937e50df211b3",
- "sha256:eb900ef15b340a87185412b906cbef2450ab68907b84988a6af7fc8e1b017b6c",
- "sha256:fa5d985e30fbcbccaf8b9b1e52137abfbbe0f913fc1baddb96b9476824013411",
- "sha256:fb8ba7ddf85c80eb4d2e3c2f1b362793b9f5c78950bc1e93f60423241071fd9b",
- "sha256:ff80ceca738f52d9058d146b7b063092a88281362606e6c917af2a9e60087a3d"
+ "sha256:24872e3e427b16f77461ae7e6cf48f9c5c03c06884ac51bad179580a4dd29145",
+ "sha256:2fb97bd25d436bd59fe079885aa77a3a2f23cface9c6359d4700053665849262",
+ "sha256:3207c76d1510571cbda85560c293dec5f8d6645103b3f471abab5c83e51a7ccd",
+ "sha256:344a7fec57d3348002392a7bd5cf66fb9dbe4a103e81636037cccd6fff944e28",
+ "sha256:382739e499deaf488459c2307ebcc0e9b3653340801d6053c207c84ad710ee8d",
+ "sha256:4d6bfcc352cd608fcf325037b4425c0edaeae0c6a5439423a865110b59f897e9",
+ "sha256:4f27ddf7daf630a95e1d7dfddd0c8cf8a7755c9567dc9851bf2e74c22f34af42",
+ "sha256:5b24587b30a4d288a7b1c5cc756ee707fc1293fa28454f8db40267ed9d7e73c8",
+ "sha256:6ab745babc33a864e3ca3a2659c005ed52503e39936fff5812eeb21920009c8b",
+ "sha256:7e6592533b3f8cfbc086ea2d472058f10e5f6a04a388edca01773285c63284b4",
+ "sha256:b9ad6ad9d58c2735129f94f044b2236d7de87d77a101c8c630363add12d62a4a",
+ "sha256:be7b37020f614017d8a047565b3fb61ceef9c30a9ee093f9373d06a4c32068ae",
+ "sha256:bef263b6d78a3e63399e1b82cc07cbb30af762884df96a369cba0e1011405344",
+ "sha256:c4b7c4d0b647952d2b506948131d6e7e1c42ccb16aac8e3e52369c16b94e7215",
+ "sha256:d036bf5e1d7cb3743125d7caf62b1a23b12e403de240144b6117ddbb8f815a33",
+ "sha256:d0cbf8e7510497afd651e134bccb9d579aa90234e45734046fcb6b752d2ee312",
+ "sha256:d533ea0e527122f177943ee35eb356b8d9f7af35fe357e0cdc0514d95804aaea",
+ "sha256:e41b9313c44f54a3cd29b0e95fa32a8e685edaa9287b338f59530b21ebc0b453",
+ "sha256:e9107767cc9240cbf9b5c0fdded5eeead86a1690d1c15de6cbbdcc9d7e3b1962",
+ "sha256:f96033c335da26ea1716ab9adfce459c211b023ca09528f958fb28bf099fc0df",
+ "sha256:f970a2c6d22c934ea68d645abcc96056ecb107489f28a38b2171f65655b7e41f",
+ "sha256:fe31b3b6930d67380df371f5088950f93da5fac580cde3bedb35f992b2498e1b",
+ "sha256:ff08b9f978f8b37f2961614a68f9fdb4fabd10cdd940234e80200806d93a1c30",
+ "sha256:ff4d1557b7fb72e752c36205a344863b8f4f23b3a834780fc36eb7ebde614de7"
],
"index": "ia",
- "version": "==1.1.0"
+ "version": "==1.9.2"
},
"contextlib2": {
"hashes": [
- "sha256:509f9419ee91cdd00ba34443217d5ca51f5a364a404e1dce9e8979cea969ca48",
- "sha256:f5260a6e679d2ff42ec91ec5252f4eeffdcf21053db9113bd0a8e4d953769c00"
+ "sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f",
+ "sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869"
],
- "version": "==0.5.5"
+ "markers": "python_version >= '3.6'",
+ "version": "==21.6.0"
+ },
+ "courlan": {
+ "hashes": [
+ "sha256:d06c5b048b2b5cd5c0ac77304dc24b795e4bb257a7b6077ea405a3b5e99ae179",
+ "sha256:d141d30f8e52d344cf9904aa29e4d8750e934026bdbca2dc7bd58b750566d058"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==0.8.3"
},
"crawllib": {
"hashes": [
- "sha256:89ef1844e72b4e89508f0d343b06d8931ea8edef12729cb81aaca577a9c3874e"
+ "sha256:9a30a10318dc706f1e27ff0af950ac14a77f73c18d329771f44d872fd63630e3"
+ ],
+ "version": "==0.1.6"
+ },
+ "cython": {
+ "hashes": [
+ "sha256:061e25151c38f2361bc790d3bcf7f9d9828a0b6a4d5afa56fbed3bd33fb2373a",
+ "sha256:06be83490c906b6429b4389e13487a26254ccaad2eef6f3d4ee21d8d3a4aaa2b",
+ "sha256:07d173d3289415bb496e72cb0ddd609961be08fe2968c39094d5712ffb78672b",
+ "sha256:0bbc27abdf6aebfa1bce34cd92bd403070356f28b0ecb3198ff8a182791d58b9",
+ "sha256:0ea8267fc373a2c5064ad77d8ff7bf0ea8b88f7407098ff51829381f8ec1d5d9",
+ "sha256:3875c2b2ea752816a4d7ae59d45bb546e7c4c79093c83e3ba7f4d9051dd02928",
+ "sha256:39afb4679b8c6bf7ccb15b24025568f4f9b4d7f9bf3cbd981021f542acecd75b",
+ "sha256:3f85eb2343d20d91a4ea9cf14e5748092b376a64b7e07fc224e85b2753e9070b",
+ "sha256:40eff7aa26e91cf108fd740ffd4daf49f39b2fdffadabc7292b4b7dc5df879f0",
+ "sha256:479690d2892ca56d34812fe6ab8f58e4b2e0129140f3d94518f15993c40553da",
+ "sha256:4a4b03ab483271f69221c3210f7cde0dcc456749ecf8243b95bc7a701e5677e0",
+ "sha256:513e9707407608ac0d306c8b09d55a28be23ea4152cbd356ceaec0f32ef08d65",
+ "sha256:5514f3b4122cb22317122a48e175a7194e18e1803ca555c4c959d7dfe68eaf98",
+ "sha256:5ba622326f2862f9c1f99ca8d47ade49871241920a352c917e16861e25b0e5c3",
+ "sha256:63b79d9e1f7c4d1f498ab1322156a0d7dc1b6004bf981a8abda3f66800e140cd",
+ "sha256:656dc5ff1d269de4d11ee8542f2ffd15ab466c447c1f10e5b8aba6f561967276",
+ "sha256:67fdd2f652f8d4840042e2d2d91e15636ba2bcdcd92e7e5ffbc68e6ef633a754",
+ "sha256:79e3bab19cf1b021b613567c22eb18b76c0c547b9bc3903881a07bfd9e7e64cf",
+ "sha256:856d2fec682b3f31583719cb6925c6cdbb9aa30f03122bcc45c65c8b6f515754",
+ "sha256:8669cadeb26d9a58a5e6b8ce34d2c8986cc3b5c0bfa77eda6ceb471596cb2ec3",
+ "sha256:8733cf4758b79304f2a4e39ebfac5e92341bce47bcceb26c1254398b2f8c1af7",
+ "sha256:97335b2cd4acebf30d14e2855d882de83ad838491a09be2011745579ac975833",
+ "sha256:afbce249133a830f121b917f8c9404a44f2950e0e4f5d1e68f043da4c2e9f457",
+ "sha256:b0595aee62809ba353cebc5c7978e0e443760c3e882e2c7672c73ffe46383673",
+ "sha256:b6da3063c5c476f5311fd76854abae6c315f1513ef7d7904deed2e774623bbb9",
+ "sha256:c8e8025f496b5acb6ba95da2fb3e9dacffc97d9a92711aacfdd42f9c5927e094",
+ "sha256:cddc47ec746a08603037731f5d10aebf770ced08666100bd2cdcaf06a85d4d1b",
+ "sha256:cdf10af3e2e3279dc09fdc5f95deaa624850a53913f30350ceee824dc14fc1a6",
+ "sha256:d968ffc403d92addf20b68924d95428d523436adfd25cf505d427ed7ba3bee8b",
+ "sha256:dbee03b8d42dca924e6aa057b836a064c769ddfd2a4c2919e65da2c8a362d528",
+ "sha256:e1958e0227a4a6a2c06fd6e35b7469de50adf174102454db397cec6e1403cce3",
+ "sha256:e6ffa08aa1c111a1ebcbd1cf4afaaec120bc0bbdec3f2545f8bb7d3e8e77a1cd",
+ "sha256:e83228e0994497900af954adcac27f64c9a57cd70a9ec768ab0cb2c01fd15cf1",
+ "sha256:ea1dcc07bfb37367b639415333cfbfe4a93c3be340edf1db10964bc27d42ed64",
+ "sha256:eca3065a1279456e81c615211d025ea11bfe4e19f0c5650b859868ca04b3fcbd",
+ "sha256:ed087eeb88a8cf96c60fb76c5c3b5fb87188adee5e179f89ec9ad9a43c0c54b3",
+ "sha256:eeb475eb6f0ccf6c039035eb4f0f928eb53ead88777e0a760eccb140ad90930b",
+ "sha256:eefd2b9a5f38ded8d859fe96cc28d7d06e098dc3f677e7adbafda4dcdd4a461c",
+ "sha256:f3fd44cc362eee8ae569025f070d56208908916794b6ab21e139cea56470a2b3",
+ "sha256:f9944013588a3543fca795fffb0a070a31a243aa4f2d212f118aa95e69485831"
+ ],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==0.29.32"
+ },
+ "dateparser": {
+ "hashes": [
+ "sha256:4431159799b63d8acec5d7d844c5e06edf3d1b0eb2bda6d4cac87134ddddd01c",
+ "sha256:73ec6e44a133c54076ecf9f9dc0fbe3dd4831f154f977ff06f53114d57c5425e"
],
- "version": "==0.1.4.3"
+ "index": "ia",
+ "version": "==1.1.4"
},
"dawg": {
"hashes": [
- "sha256:111aec946fc6045776e8a977f8be841b099769f3c8ab041dba4773ffeda21ad5",
- "sha256:30d5da3e48b8cbe5ec94c5a202d2962780d3895ba0883123e6788565f71b2953",
- "sha256:3a5ea13d5a424542d1a7fa908db974e712be90ccdd86cec9e24c6b20794f5f5e",
- "sha256:402659e3044a5fb79dadefeaabb15ba9c0ef56c844bb4bcde6b102afbf4788f8",
- "sha256:7accbfe484a353e1f02a947f84f817846f30738d1170d4e855f536d5708632a3",
- "sha256:7d0a904e91adfa3de7071bfe64cd1334ce4040f1795cca8c13598bd075e72e18",
- "sha256:9c7321d4f2a580506e06c29ed276ae50df9eb153470e8e980e79409e12b18e55",
- "sha256:ad0fdd2f6ed0a0155f00e7f61f3649898dabf7e344eb87732b34414f34cc31d9",
- "sha256:b1f9c72bb3eca530f78fcf82f2d60ff41298f10e1c9f018b402af0ecbe246171",
- "sha256:d6d5f9e4a37bf9b2c4fec504eaf8cfc30d7f994635c35a6f14ced5f41a72e2f9"
+ "sha256:28c4c934ab1ca74226a46e6213f919f5b0912bf9de87218264d4d94c15521753",
+ "sha256:34881e06278d4a54cf0b402c0c8b587bef0caa78f0eee595adc7a2aa530e48ce",
+ "sha256:73760ad1272b1b47997f1a768b8f3bf547c92475bcd62185f4ab7e1bc691964e",
+ "sha256:7aecc4c89243edaf1efe7a4d769d993a7cd9307a8a04f48e07c4fc7c44bdd38f",
+ "sha256:83ce4a73f7632b0ed31af16c2750533ecbed347bad1148a52f6436e348b5b7ac",
+ "sha256:a5a0ae005de5095d53139895d71d09d78a613f8884583a34725b177fd53ada29",
+ "sha256:d78929f5a7f7e083f5720992068535d133f0d3326f0c677c61c59256aa43d95e",
+ "sha256:e664a884ca48f2599ad5c2289d9b7f769e77d266560c79992e3db2cfce96cb1b",
+ "sha256:fb90b799fb7d6d728531840529c812a9ee17736da71e8a596ede8bfd6c62bf36",
+ "sha256:feb6073e0d02ac54389ad378e6c695e28fe579e2772c225a854299752effece6"
],
- "version": "==0.7.8"
+ "version": "==0.8.0"
},
"decorator": {
"hashes": [
- "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
- "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
],
- "version": "==4.4.0"
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
},
"docopt": {
"hashes": [
@@ -203,60 +354,72 @@
],
"version": "==0.6.2"
},
- "docutils": {
+ "dogpile.cache": {
"hashes": [
- "sha256:6c4f696463b79f1fb8ba0c594b63840ebd41f059e92b31957c46b74a4599b6d0",
- "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827",
- "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99"
+ "sha256:bc9dde1ffa5de0179efbcdc73773ef0553921130ad01955422f2932be35c059e"
],
- "version": "==0.15.2"
+ "version": "==0.9.2"
},
- "dogpile.cache": {
+ "dynaconf": {
"hashes": [
- "sha256:70f5eae4aec908f76188a2c287e07105f60c05d879bb9a4efcc5ba44563d8de6"
+ "sha256:87e0b3b12b5db9e8fb465e1f8c7fdb926cd2ec5b6d88aa7f821f316df93fb165",
+ "sha256:d9cfb50fd4a71a543fd23845d4f585b620b6ff6d9d3cc1825c614f7b2097cb39"
],
- "version": "==0.8.0"
+ "index": "ia",
+ "version": "==3.1.11"
},
"elasticsearch": {
"hashes": [
- "sha256:1f0f633e3b500d5042424f75a505badf8c4b9962c1b4734cdfb3087fb67920be",
- "sha256:fb5ab15ee283f104b5a7a5695c7e879cb2927e4eb5aed9c530811590b41259ad"
+ "sha256:840adeb45a5ec9102a83f3cf481aae83a3775b75d6dd83a7310b04e44a5d0308",
+ "sha256:f511ea92e96db09b0e96b0de5fbbb7aa5c3740b0c571a364a2c3a1cc7ec06203"
],
- "version": "==6.4.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' and python_version < '4'",
+ "version": "==7.17.8"
},
- "flask": {
+ "filelock": {
"hashes": [
- "sha256:13f9f196f330c7c2c5d7a5cf91af894110ca0215ac051b5844701f2bfd934d52",
- "sha256:45eb5a6fd193d6cf7e0cf5d8a5b31f83d5faae0293695626f539a823e93b13f6"
+ "sha256:7565f628ea56bfcd8e54e42bdc55da899c85c1abfe1b5bcfd147e9188cebb3b2",
+ "sha256:8df285554452285f79c035efb0c861eb33a4bcfa5b7a137016e32e6a90f9792c"
],
- "index": "ia",
- "version": "==1.1.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.8.2"
},
"ftfy": {
"hashes": [
- "sha256:6d7509c45e602dec890f0f6ee0623a8b5f50ec1188ac7ab9535e18e572c99bcc"
+ "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca",
+ "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"
],
"index": "ia",
- "version": "==5.6"
+ "version": "==6.1.1"
},
- "future": {
+ "globalwayback": {
"hashes": [
- "sha256:67045236dcfd6816dc439556d009594abf643e5eb48992e36beac09c2ca659b8"
+ "sha256:683f19dee720ef11335952aa33615e50c945196c82e18a5d8150635f92022d23"
],
- "version": "==0.17.1"
+ "index": "ia",
+ "version": "==0.8.12.6"
},
- "globalwayback": {
+ "grobid-tei-xml": {
+ "hashes": [
+ "sha256:022fdf54dbd067b520c1effe3c1a1f2ac248492ea310627e9462757748cb461b",
+ "sha256:35c9afb14f6f76100dce5f5815e67ec9fa4122e2f268394e0baf6eafbd8668d8"
+ ],
+ "index": "ia",
+ "version": "==0.1.3"
+ },
+ "htmldate": {
"hashes": [
- "sha256:50acd135994223db1e69969ff20db858e1aae4edd67d1d23ded89b7b5a7216d0"
+ "sha256:603b86eaf0f076efcd653d57fe0470305f751417711f4e373279235d0ff587e6",
+ "sha256:83830715faf0f22272d9e24e571a4955308a008107d0ca9359c0de77b99766cd"
],
"index": "ia",
- "version": "==0.4.9.2"
+ "version": "==1.3.2"
},
"ialib": {
"hashes": [
- "sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc"
+ "sha256:0b1745e512266fd6c91af68763f2f8427eec6c92c5009fc75c50d9352fc764fc"
],
- "version": "==0.3.0.1"
+ "version": "==0.5.1.1"
},
"idna": {
"hashes": [
@@ -267,95 +430,201 @@
},
"internetarchive": {
"hashes": [
- "sha256:13c7d3aef852c3e35bb1fec6bd7ab2b898dd4b01bd98c90f902eb0d5dc2e3949",
- "sha256:2e7477ed3fe43f1d09853b281c8034deafacf7ebc614ba80788c90ecdbf0c3d3"
+ "sha256:de856465c2ef6852184d08bfd59c0ca01904865b373a27b383034ac6b4128eb6"
],
"index": "ia",
- "version": "==1.8.5"
- },
- "itsdangerous": {
- "hashes": [
- "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19",
- "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"
- ],
- "version": "==1.1.0"
+ "version": "==3.0.2"
},
"jinja2": {
"hashes": [
- "sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013",
- "sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b"
+ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
+ "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
],
- "version": "==2.10.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.1.2"
},
"jmespath": {
"hashes": [
- "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6",
- "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c"
+ "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980",
+ "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"
],
- "version": "==0.9.4"
+ "markers": "python_version >= '3.7'",
+ "version": "==1.0.1"
},
"jsonpatch": {
"hashes": [
- "sha256:83f29a2978c13da29bfdf89da9d65542d62576479caf215df19632d7dc04c6e6",
- "sha256:cbb72f8bf35260628aea6b508a107245f757d1ec839a19c34349985e2c05645a"
+ "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397",
+ "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"
],
- "version": "==1.24"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==1.32"
},
"jsonpointer": {
"hashes": [
- "sha256:c192ba86648e05fdae4f08a17ec25180a9aef5008d973407b581798a83975362",
- "sha256:ff379fa021d1b81ab539f5ec467c7745beb1a5671463f9dcc2b2d458bd361c1e"
- ],
- "version": "==2.0"
- },
- "kazoo": {
- "hashes": [
- "sha256:8db774f7bdece7d0dc7decb21539ff0852e42c2ffe1c28d7f1ff6f9292a1c3a4",
- "sha256:a5fa2e400c5068cfee9e86b35cf0dab8232b574152d8e3590d823b3e2426ab5e"
- ],
- "version": "==2.5.0"
+ "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9",
+ "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==2.3"
+ },
+ "justext": {
+ "hashes": [
+ "sha256:7640e248218795f6be65f6c35fe697325a3280fcb4675d1525bcdff2b86faadf",
+ "sha256:86b48f5b1d99505acd072f5831def6cd3f1306043651c524a1c609e62e3544e4"
+ ],
+ "version": "==3.0.0"
+ },
+ "langcodes": {
+ "hashes": [
+ "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69",
+ "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==3.3.0"
+ },
+ "lxml": {
+ "hashes": [
+ "sha256:01d36c05f4afb8f7c20fd9ed5badca32a2029b93b1750f571ccc0b142531caf7",
+ "sha256:04876580c050a8c5341d706dd464ff04fd597095cc8c023252566a8826505726",
+ "sha256:05ca3f6abf5cf78fe053da9b1166e062ade3fa5d4f92b4ed688127ea7d7b1d03",
+ "sha256:090c6543d3696cbe15b4ac6e175e576bcc3f1ccfbba970061b7300b0c15a2140",
+ "sha256:0dc313ef231edf866912e9d8f5a042ddab56c752619e92dfd3a2c277e6a7299a",
+ "sha256:0f2b1e0d79180f344ff9f321327b005ca043a50ece8713de61d1cb383fb8ac05",
+ "sha256:13598ecfbd2e86ea7ae45ec28a2a54fb87ee9b9fdb0f6d343297d8e548392c03",
+ "sha256:16efd54337136e8cd72fb9485c368d91d77a47ee2d42b057564aae201257d419",
+ "sha256:1ab8f1f932e8f82355e75dda5413a57612c6ea448069d4fb2e217e9a4bed13d4",
+ "sha256:223f4232855ade399bd409331e6ca70fb5578efef22cf4069a6090acc0f53c0e",
+ "sha256:2455cfaeb7ac70338b3257f41e21f0724f4b5b0c0e7702da67ee6c3640835b67",
+ "sha256:2899456259589aa38bfb018c364d6ae7b53c5c22d8e27d0ec7609c2a1ff78b50",
+ "sha256:2a29ba94d065945944016b6b74e538bdb1751a1db6ffb80c9d3c2e40d6fa9894",
+ "sha256:2a87fa548561d2f4643c99cd13131acb607ddabb70682dcf1dff5f71f781a4bf",
+ "sha256:2e430cd2824f05f2d4f687701144556646bae8f249fd60aa1e4c768ba7018947",
+ "sha256:36c3c175d34652a35475a73762b545f4527aec044910a651d2bf50de9c3352b1",
+ "sha256:3818b8e2c4b5148567e1b09ce739006acfaa44ce3156f8cbbc11062994b8e8dd",
+ "sha256:3ab9fa9d6dc2a7f29d7affdf3edebf6ece6fb28a6d80b14c3b2fb9d39b9322c3",
+ "sha256:3efea981d956a6f7173b4659849f55081867cf897e719f57383698af6f618a92",
+ "sha256:4c8f293f14abc8fd3e8e01c5bd86e6ed0b6ef71936ded5bf10fe7a5efefbaca3",
+ "sha256:5344a43228767f53a9df6e5b253f8cdca7dfc7b7aeae52551958192f56d98457",
+ "sha256:58bfa3aa19ca4c0f28c5dde0ff56c520fbac6f0daf4fac66ed4c8d2fb7f22e74",
+ "sha256:5b4545b8a40478183ac06c073e81a5ce4cf01bf1734962577cf2bb569a5b3bbf",
+ "sha256:5f50a1c177e2fa3ee0667a5ab79fdc6b23086bc8b589d90b93b4bd17eb0e64d1",
+ "sha256:63da2ccc0857c311d764e7d3d90f429c252e83b52d1f8f1d1fe55be26827d1f4",
+ "sha256:6749649eecd6a9871cae297bffa4ee76f90b4504a2a2ab528d9ebe912b101975",
+ "sha256:6804daeb7ef69e7b36f76caddb85cccd63d0c56dedb47555d2fc969e2af6a1a5",
+ "sha256:689bb688a1db722485e4610a503e3e9210dcc20c520b45ac8f7533c837be76fe",
+ "sha256:699a9af7dffaf67deeae27b2112aa06b41c370d5e7633e0ee0aea2e0b6c211f7",
+ "sha256:6b418afe5df18233fc6b6093deb82a32895b6bb0b1155c2cdb05203f583053f1",
+ "sha256:76cf573e5a365e790396a5cc2b909812633409306c6531a6877c59061e42c4f2",
+ "sha256:7b515674acfdcadb0eb5d00d8a709868173acece5cb0be3dd165950cbfdf5409",
+ "sha256:7b770ed79542ed52c519119473898198761d78beb24b107acf3ad65deae61f1f",
+ "sha256:7d2278d59425777cfcb19735018d897ca8303abe67cc735f9f97177ceff8027f",
+ "sha256:7e91ee82f4199af8c43d8158024cbdff3d931df350252288f0d4ce656df7f3b5",
+ "sha256:821b7f59b99551c69c85a6039c65b75f5683bdc63270fec660f75da67469ca24",
+ "sha256:822068f85e12a6e292803e112ab876bc03ed1f03dddb80154c395f891ca6b31e",
+ "sha256:8340225bd5e7a701c0fa98284c849c9b9fc9238abf53a0ebd90900f25d39a4e4",
+ "sha256:85cabf64adec449132e55616e7ca3e1000ab449d1d0f9d7f83146ed5bdcb6d8a",
+ "sha256:880bbbcbe2fca64e2f4d8e04db47bcdf504936fa2b33933efd945e1b429bea8c",
+ "sha256:8d0b4612b66ff5d62d03bcaa043bb018f74dfea51184e53f067e6fdcba4bd8de",
+ "sha256:8e20cb5a47247e383cf4ff523205060991021233ebd6f924bca927fcf25cf86f",
+ "sha256:925073b2fe14ab9b87e73f9a5fde6ce6392da430f3004d8b72cc86f746f5163b",
+ "sha256:998c7c41910666d2976928c38ea96a70d1aa43be6fe502f21a651e17483a43c5",
+ "sha256:9b22c5c66f67ae00c0199f6055705bc3eb3fcb08d03d2ec4059a2b1b25ed48d7",
+ "sha256:9f102706d0ca011de571de32c3247c6476b55bb6bc65a20f682f000b07a4852a",
+ "sha256:a08cff61517ee26cb56f1e949cca38caabe9ea9fbb4b1e10a805dc39844b7d5c",
+ "sha256:a0a336d6d3e8b234a3aae3c674873d8f0e720b76bc1d9416866c41cd9500ffb9",
+ "sha256:a35f8b7fa99f90dd2f5dc5a9fa12332642f087a7641289ca6c40d6e1a2637d8e",
+ "sha256:a38486985ca49cfa574a507e7a2215c0c780fd1778bb6290c21193b7211702ab",
+ "sha256:a5da296eb617d18e497bcf0a5c528f5d3b18dadb3619fbdadf4ed2356ef8d941",
+ "sha256:a6e441a86553c310258aca15d1c05903aaf4965b23f3bc2d55f200804e005ee5",
+ "sha256:a82d05da00a58b8e4c0008edbc8a4b6ec5a4bc1e2ee0fb6ed157cf634ed7fa45",
+ "sha256:ab323679b8b3030000f2be63e22cdeea5b47ee0abd2d6a1dc0c8103ddaa56cd7",
+ "sha256:b1f42b6921d0e81b1bcb5e395bc091a70f41c4d4e55ba99c6da2b31626c44892",
+ "sha256:b23e19989c355ca854276178a0463951a653309fb8e57ce674497f2d9f208746",
+ "sha256:b264171e3143d842ded311b7dccd46ff9ef34247129ff5bf5066123c55c2431c",
+ "sha256:b26a29f0b7fc6f0897f043ca366142d2b609dc60756ee6e4e90b5f762c6adc53",
+ "sha256:b64d891da92e232c36976c80ed7ebb383e3f148489796d8d31a5b6a677825efe",
+ "sha256:b9cc34af337a97d470040f99ba4282f6e6bac88407d021688a5d585e44a23184",
+ "sha256:bc718cd47b765e790eecb74d044cc8d37d58562f6c314ee9484df26276d36a38",
+ "sha256:be7292c55101e22f2a3d4d8913944cbea71eea90792bf914add27454a13905df",
+ "sha256:c83203addf554215463b59f6399835201999b5e48019dc17f182ed5ad87205c9",
+ "sha256:c9ec3eaf616d67db0764b3bb983962b4f385a1f08304fd30c7283954e6a7869b",
+ "sha256:ca34efc80a29351897e18888c71c6aca4a359247c87e0b1c7ada14f0ab0c0fb2",
+ "sha256:ca989b91cf3a3ba28930a9fc1e9aeafc2a395448641df1f387a2d394638943b0",
+ "sha256:d02a5399126a53492415d4906ab0ad0375a5456cc05c3fc0fc4ca11771745cda",
+ "sha256:d17bc7c2ccf49c478c5bdd447594e82692c74222698cfc9b5daae7ae7e90743b",
+ "sha256:d5bf6545cd27aaa8a13033ce56354ed9e25ab0e4ac3b5392b763d8d04b08e0c5",
+ "sha256:d6b430a9938a5a5d85fc107d852262ddcd48602c120e3dbb02137c83d212b380",
+ "sha256:da248f93f0418a9e9d94b0080d7ebc407a9a5e6d0b57bb30db9b5cc28de1ad33",
+ "sha256:da4dd7c9c50c059aba52b3524f84d7de956f7fef88f0bafcf4ad7dde94a064e8",
+ "sha256:df0623dcf9668ad0445e0558a21211d4e9a149ea8f5666917c8eeec515f0a6d1",
+ "sha256:e5168986b90a8d1f2f9dc1b841467c74221bd752537b99761a93d2d981e04889",
+ "sha256:efa29c2fe6b4fdd32e8ef81c1528506895eca86e1d8c4657fda04c9b3786ddf9",
+ "sha256:f1496ea22ca2c830cbcbd473de8f114a320da308438ae65abad6bab7867fe38f",
+ "sha256:f49e52d174375a7def9915c9f06ec4e569d235ad428f70751765f48d5926678c"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==4.9.2"
},
"markupsafe": {
"hashes": [
- "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
- "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
- "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
- "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
- "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
- "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
- "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
- "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
- "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
- "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
- "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
- "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
- "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
- "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
- "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
- "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
- "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
- "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
- "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
- "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
- "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
- "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
- "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
- "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
- "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
- "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
- "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
- "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"
- ],
- "version": "==1.1.1"
+ "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
+ "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
+ "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
+ "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
+ "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
+ "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
+ "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
+ "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
+ "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
+ "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
+ "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
+ "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
+ "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
+ "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
+ "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
+ "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
+ "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
+ "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
+ "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
+ "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
+ "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
+ "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
+ "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
+ "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
+ "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
+ "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
+ "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
+ "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
+ "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
+ "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
+ "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
+ "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
+ "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
+ "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
+ "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
+ "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
+ "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
+ "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
+ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
+ "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.1.1"
},
"minio": {
"hashes": [
- "sha256:8f7ba1ca0750dfca3302cb03b14a92bf5f1c755ff84f9ba268079bf582e0f735",
- "sha256:cac56a950217b989daa460e8532ce5c68635a1bd1d54b8ab4e4285659cb8fa92",
- "sha256:dfb602d8a00944b562a40d43a3fd80cdf707277a73b450ebbacc3cabe20fbb21"
+ "sha256:7cb075b56bac894551304cb824f958069a84e0dd2d0a685f9bed3c05e15727bf",
+ "sha256:acae9bfae0aec1b92025bd63e18135ebb4994c84600716c5323e14cb0c9a0b03",
+ "sha256:eec4ab073ff979c34e928e532d8acc1d40d61ba4404709cf27ab3ecdcfa2a561"
],
"index": "ia",
- "version": "==5.0.1"
+ "version": "==6.0.2"
+ },
+ "perfstat": {
+ "hashes": [
+ "sha256:4f91fab9be6076972c66fe818eed488be28f1044009237adccce42ff2c7861f5"
+ ],
+ "version": "==0.1.0.1"
},
"pillow": {
"hashes": [
@@ -373,6 +642,7 @@
"sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac",
"sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389",
"sha256:4756b77682c7335751a2cfc0e9e6d96945d88ffd315420d9010235021ddfc64c",
+ "sha256:482feca2305feef9d5c38bae66734c64d7d3e649e5b8e01115894ad6d399bad1",
"sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff",
"sha256:502275e7a83872e62e0cd0be4da575a53a1f9703341aba814527dacdda3660a8",
"sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f",
@@ -380,6 +650,7 @@
"sha256:5cda8efe9e0849858986c06cb068ac4de0933780f84fa989d6dae2a85c304d2b",
"sha256:61cb7e1ff212293d74155397fad008b052bc9633efdf9caa4271c316cd25b99f",
"sha256:68e45ed91531d3b05a17a356fd4cb928603a00259916e057730a024b029d6b51",
+ "sha256:699ad2e1a865433f89c7be40fed71d2497da525dc9938218ac3f222a464ea32a",
"sha256:69c3cc797a66241ed2fa61ff6f52c73e7bd3e738d80d64abb3500e6fbdec30a3",
"sha256:69ec2f0effe8b395e55929bb4d9a3ca8ff0c40f85d61d00e1e5b1e504d28b5e2",
"sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3",
@@ -407,138 +678,431 @@
"sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c",
"sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394"
],
+ "index": "ia",
"version": "==3.1.1"
},
"psycopg2": {
"hashes": [
- "sha256:128d0fa910ada0157bba1cb74a9c5f92bb8a1dca77cf91a31eb274d1f889e001",
- "sha256:1eee1b871d4ac810a0e0e7d46e20dcad0dabe48d74a68ad38f429d15e14a0fb4",
- "sha256:227fd46cf9b7255f07687e5bde454d7d67ae39ca77e170097cdef8ebfc30c323",
- "sha256:2315e7f104681d498ccf6fd70b0dba5bce65d60ac92171492bfe228e21dcc242",
- "sha256:4b5417dcd2999db0f5a891d54717cfaee33acc64f4772c4bc574d4ff95ed9d80",
- "sha256:640113ddc943522aaf71294e3f2d24013b0edd659b7820621492c9ebd3a2fb0b",
- "sha256:897a6e838319b4bf648a574afb6cabcb17d0488f8c7195100d48d872419f4457",
- "sha256:8dceca81409898c870e011c71179454962dec152a1a6b86a347f4be74b16d864",
- "sha256:99a41fb03ce51826aced23e85b432f45163d80bf72a74c17dd4055ed6313178d",
- "sha256:a6a1ca3a983c5600af87f039e8a86b5f191da0e335bfacefd2bcc27c76997dd6",
- "sha256:b1b8e41da09a0c3ef0b3d4bb72da0dde2abebe583c1e8462973233fd5ad0235f",
- "sha256:c3993c3ad35ca732e35f0ad616dacde07d3df390e8e0a09bd02ffccde562d063",
- "sha256:cb407fccc12fc29dc331f2b934913405fa49b9b75af4f3a72d0f50f57ad2ca23",
- "sha256:d3a27550a8185e53b244ad7e79e307594b92fede8617d80200a8cce1fba2c60f",
- "sha256:f0e6b697a975d9d3ccd04135316c947dd82d841067c7800ccf622a8717e98df1"
+ "sha256:093e3894d2d3c592ab0945d9eba9d139c139664dcf83a1c440b8a7aa9bb21955",
+ "sha256:190d51e8c1b25a47484e52a79638a8182451d6f6dff99f26ad9bd81e5359a0fa",
+ "sha256:1a5c7d7d577e0eabfcf15eb87d1e19314c8c4f0e722a301f98e0e3a65e238b4e",
+ "sha256:1e5a38aa85bd660c53947bd28aeaafb6a97d70423606f1ccb044a03a1203fe4a",
+ "sha256:322fd5fca0b1113677089d4ebd5222c964b1760e361f151cbb2706c4912112c5",
+ "sha256:4cb9936316d88bfab614666eb9e32995e794ed0f8f6b3b718666c22819c1d7ee",
+ "sha256:920bf418000dd17669d2904472efeab2b20546efd0548139618f8fa305d1d7ad",
+ "sha256:922cc5f0b98a5f2b1ff481f5551b95cd04580fd6f0c72d9b22e6c0145a4840e0",
+ "sha256:a5246d2e683a972e2187a8714b5c2cf8156c064629f9a9b1a873c1730d9e245a",
+ "sha256:b9ac1b0d8ecc49e05e4e182694f418d27f3aedcfca854ebd6c05bb1cffa10d6d",
+ "sha256:d3ef67e630b0de0779c42912fe2cbae3805ebaba30cda27fea2a3de650a9414f",
+ "sha256:f5b6320dbc3cf6cfb9f25308286f9f7ab464e65cfb105b64cc9c52831748ced2",
+ "sha256:fc04dd5189b90d825509caa510f20d1d504761e78b8dfb95a0ede180f71d50e5"
],
"index": "ia",
- "version": "==2.8.3"
+ "version": "==2.9.5"
},
"publicsuffix": {
"hashes": [
- "sha256:99a3a06d6eb19c57057d17560908b757995396ad76e6513c9d17e6a7a1266c91",
- "sha256:ae77593d269e1e5131723259cc1142c25690c20c59f2e98f67e227228028bda9",
- "sha256:eeb90d6cb0ae26d3af43f4d53f4c5eb6cfa437ad16a73c06c6caabb8f36ae1e5"
+ "sha256:22ce1d65ab6af5e9b2122e2443facdb93fb5c4abf24138099cb10fe7989f43b6"
],
- "version": "==1.1.0"
+ "version": "==1.1.1"
},
- "pykafka": {
- "hashes": [
- "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459",
- "sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577"
+ "pydantic": {
+ "hashes": [
+ "sha256:05e00dbebbe810b33c7a7362f231893183bcc4251f3f2ff991c31d5c08240c42",
+ "sha256:06094d18dd5e6f2bbf93efa54991c3240964bb663b87729ac340eb5014310624",
+ "sha256:0b959f4d8211fc964772b595ebb25f7652da3f22322c007b6fed26846a40685e",
+ "sha256:19b3b9ccf97af2b7519c42032441a891a5e05c68368f40865a90eb88833c2559",
+ "sha256:1b6ee725bd6e83ec78b1aa32c5b1fa67a3a65badddde3976bca5fe4568f27709",
+ "sha256:1ee433e274268a4b0c8fde7ad9d58ecba12b069a033ecc4645bb6303c062d2e9",
+ "sha256:216f3bcbf19c726b1cc22b099dd409aa371f55c08800bcea4c44c8f74b73478d",
+ "sha256:2d0567e60eb01bccda3a4df01df677adf6b437958d35c12a3ac3e0f078b0ee52",
+ "sha256:2e05aed07fa02231dbf03d0adb1be1d79cabb09025dd45aa094aa8b4e7b9dcda",
+ "sha256:352aedb1d71b8b0736c6d56ad2bd34c6982720644b0624462059ab29bd6e5912",
+ "sha256:355639d9afc76bcb9b0c3000ddcd08472ae75318a6eb67a15866b87e2efa168c",
+ "sha256:37c90345ec7dd2f1bcef82ce49b6235b40f282b94d3eec47e801baf864d15525",
+ "sha256:4b8795290deaae348c4eba0cebb196e1c6b98bdbe7f50b2d0d9a4a99716342fe",
+ "sha256:5760e164b807a48a8f25f8aa1a6d857e6ce62e7ec83ea5d5c5a802eac81bad41",
+ "sha256:6eb843dcc411b6a2237a694f5e1d649fc66c6064d02b204a7e9d194dff81eb4b",
+ "sha256:7b5ba54d026c2bd2cb769d3468885f23f43710f651688e91f5fb1edcf0ee9283",
+ "sha256:7c2abc4393dea97a4ccbb4ec7d8658d4e22c4765b7b9b9445588f16c71ad9965",
+ "sha256:81a7b66c3f499108b448f3f004801fcd7d7165fb4200acb03f1c2402da73ce4c",
+ "sha256:91b8e218852ef6007c2b98cd861601c6a09f1aa32bbbb74fab5b1c33d4a1e410",
+ "sha256:9300fcbebf85f6339a02c6994b2eb3ff1b9c8c14f502058b5bf349d42447dcf5",
+ "sha256:9cabf4a7f05a776e7793e72793cd92cc865ea0e83a819f9ae4ecccb1b8aa6116",
+ "sha256:a1f5a63a6dfe19d719b1b6e6106561869d2efaca6167f84f5ab9347887d78b98",
+ "sha256:a4c805731c33a8db4b6ace45ce440c4ef5336e712508b4d9e1aafa617dc9907f",
+ "sha256:ae544c47bec47a86bc7d350f965d8b15540e27e5aa4f55170ac6a75e5f73b644",
+ "sha256:b97890e56a694486f772d36efd2ba31612739bc6f3caeee50e9e7e3ebd2fdd13",
+ "sha256:bb6ad4489af1bac6955d38ebcb95079a836af31e4c4f74aba1ca05bb9f6027bd",
+ "sha256:bedf309630209e78582ffacda64a21f96f3ed2e51fbf3962d4d488e503420254",
+ "sha256:c1ba1afb396148bbc70e9eaa8c06c1716fdddabaf86e7027c5988bae2a829ab6",
+ "sha256:c33602f93bfb67779f9c507e4d69451664524389546bacfe1bee13cae6dc7488",
+ "sha256:c4aac8e7103bf598373208f6299fa9a5cfd1fc571f2d40bf1dd1955a63d6eeb5",
+ "sha256:c6f981882aea41e021f72779ce2a4e87267458cc4d39ea990729e21ef18f0f8c",
+ "sha256:cc78cc83110d2f275ec1970e7a831f4e371ee92405332ebfe9860a715f8336e1",
+ "sha256:d49f3db871575e0426b12e2f32fdb25e579dea16486a26e5a0474af87cb1ab0a",
+ "sha256:dd3f9a40c16daf323cf913593083698caee97df2804aa36c4b3175d5ac1b92a2",
+ "sha256:e0bedafe4bc165ad0a56ac0bd7695df25c50f76961da29c050712596cf092d6d",
+ "sha256:e9069e1b01525a96e6ff49e25876d90d5a563bc31c658289a8772ae186552236"
],
"index": "ia",
- "version": "==2.8.0"
+ "version": "==1.10.2"
},
"pylru": {
"hashes": [
- "sha256:492f934bb98dc6c8b2370c02c95c65516ddc08c8f64d27f70087eb038621d297"
+ "sha256:47ad140a63ab9389648dadfbb4330700e0ffeeb28ec04664ee47d37ed133b0f4",
+ "sha256:b7c75b0676e2fbae647823bc209e23998772867d3679f1583c7350a9b02a59f0"
+ ],
+ "version": "==1.2.1"
+ },
+ "pymupdf": {
+ "hashes": [
+ "sha256:05c54acf69ee55ef97453f9c52982ef2839c188fe464d6b4cdc053bd4c6298f1",
+ "sha256:11b913664c059146e512e8559ebd9f976570ef21c0338c953836bc02051c1d7e",
+ "sha256:13ed689e5ad4c3adecb7586050de8baaa1819f48e2c57ca4e87f80e3b2727cb3",
+ "sha256:164dc67f1f5db3b22207b2aeba0fadff0503123c8f31c46768b7da7d3595a181",
+ "sha256:1e7b85e2611a9cca7a410e4c5a510a11131de7c5da9379e46615a8d3adfa6df5",
+ "sha256:38188f88a6e648b9f3a87d29de5b4ed52f910827a15859b183f1321c68e6ac00",
+ "sha256:39192c009afd8dd877a79ed02519ec8d17699bec9e9543115e490f06a553e200",
+ "sha256:4c5e7211b85e13050ac6e25879d4f0476b7a04f23bd3b6442489cec9f8da8418",
+ "sha256:7281324a0325dd30c033644cc8654167dcbfe47c4b1d49805d407fa5a64ce76b",
+ "sha256:909fb46900e7422515291761a1294902cf163226ec8918ea4c3454537336dfeb",
+ "sha256:945529b7868f9fe290b11dfbc37e2b9012610fac9763686ccf91a4d968305c5e",
+ "sha256:976fb0e93f025617890f8f8d8517371684131aa0e9fc0c1d0b4cd8bd564cce27",
+ "sha256:9998f7dfa0f99d6c2c3eb0dcfbfd44433247c23c4b781bc45f76dab421bc554b",
+ "sha256:a3b8e5c2de6192c89f379283aa07aa7fd044098dab43a8cd3ac172e961caf286",
+ "sha256:b0db8c81b6c781e373ed005f7595e49b760f91edb3b36d1dc69ec29b4fad34f8",
+ "sha256:c03004415a6d140b2c4bb494bb507c9ccbd55d713407e3b5bc1dd35fa45f2be0",
+ "sha256:cfd6c666b02a066e9e76d9ce8ca5e7fa4f2bf7a8ce6934cd2837b08509d46f8e",
+ "sha256:dffe67c5574d0ebb1e39b5ecf806fb4fd4ddb01bee5630f516ece4468252c9f0",
+ "sha256:ef3d13e27f1585d776f6a2597f113aabd28d36b648b983a72850b21c5399ab08",
+ "sha256:f04086036d40af50e5d6f54e949fa12eacda2d752562a2f85215763b137bf864",
+ "sha256:f3f96bd465e9e0e2960bb70e92233af0865181b9dd8ac5bc6b159d79584df2fe"
],
- "version": "==1.2.0"
+ "index": "ia",
+ "version": "==1.19.6"
},
- "pymysql": {
+ "python-dateutil": {
"hashes": [
- "sha256:3943fbbbc1e902f41daf7f9165519f140c4451c179380677e6a848587042561a",
- "sha256:d8c059dcd81dedb85a9f034d5e22dcb4442c0b201908bede99e306d65ea7c8e7"
+ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
+ "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
- "version": "==0.9.3"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==2.8.2"
},
- "python-dateutil": {
+ "python-magic": {
"hashes": [
- "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
- "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+ "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b",
+ "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"
],
- "version": "==2.8.0"
+ "index": "ia",
+ "version": "==0.4.27"
},
- "python-magic": {
+ "python-poppler": {
"hashes": [
- "sha256:f2674dcfad52ae6c49d4803fa027809540b130db1dec928cfbb9240316831375",
- "sha256:f3765c0f582d2dfc72c15f3b5a82aecfae9498bd29ca840d72f37d7bd38bfcd5"
+ "sha256:8b6a157e51cbb4c08353a21ca3f6f396558759cdfb0b80071379ad89d5f7c533"
],
"index": "ia",
- "version": "==0.4.15"
+ "version": "==0.3.0"
},
"python-snappy": {
"hashes": [
- "sha256:9c0ba725755b749ef9b03f6ed7582cefb957c0d9f6f064a7c4314148a9dbdb61",
- "sha256:a745b3732750e2e627adf45fe2669b18afb4170431b0d100da041f807bdea0c8",
- "sha256:ac48ec6146d71627bba0fe4857984ac1f3f70a35c12eed0f91b46f353952d5fa",
- "sha256:d9c26532cfa510f45e8d135cde140e8a5603d3fb254cfec273ebc0ecf9f668e2",
- "sha256:f21e8472a7f11b65b4bb5aea1c12624e2d4199aa586c57a11faa0de86a3053a6"
+ "sha256:03bb511380fca2a13325b6f16fe8234c8e12da9660f0258cd45d9a02ffc916af",
+ "sha256:0bdb6942180660bda7f7d01f4c0def3cfc72b1c6d99aad964801775a3e379aba",
+ "sha256:0d489b50f49433494160c45048fe806de6b3aeab0586e497ebd22a0bab56e427",
+ "sha256:1a993dc8aadd901915a510fe6af5f20ae4256f527040066c22a154db8946751f",
+ "sha256:1d029f7051ec1bbeaa3e03030b6d8ed47ceb69cae9016f493c802a08af54e026",
+ "sha256:277757d5dad4e239dc1417438a0871b65b1b155beb108888e7438c27ffc6a8cc",
+ "sha256:2a7e528ab6e09c0d67dcb61a1730a292683e5ff9bb088950638d3170cf2a0a54",
+ "sha256:2aaaf618c68d8c9daebc23a20436bd01b09ee70d7fbf7072b7f38b06d2fab539",
+ "sha256:2be4f4550acd484912441f5f1209ba611ac399aac9355fee73611b9a0d4f949c",
+ "sha256:39692bedbe0b717001a99915ac0eb2d9d0bad546440d392a2042b96d813eede1",
+ "sha256:3fb9a88a4dd6336488f3de67ce75816d0d796dce53c2c6e4d70e0b565633c7fd",
+ "sha256:4038019b1bcaadde726a57430718394076c5a21545ebc5badad2c045a09546cf",
+ "sha256:463fd340a499d47b26ca42d2f36a639188738f6e2098c6dbf80aef0e60f461e1",
+ "sha256:4d3cafdf454354a621c8ab7408e45aa4e9d5c0b943b61ff4815f71ca6bdf0130",
+ "sha256:4ec533a8c1f8df797bded662ec3e494d225b37855bb63eb0d75464a07947477c",
+ "sha256:530bfb9efebcc1aab8bb4ebcbd92b54477eed11f6cf499355e882970a6d3aa7d",
+ "sha256:546c1a7470ecbf6239101e9aff0f709b68ca0f0268b34d9023019a55baa1f7c6",
+ "sha256:5843feb914796b1f0405ccf31ea0fb51034ceb65a7588edfd5a8250cb369e3b2",
+ "sha256:586724a0276d7a6083a17259d0b51622e492289a9998848a1b01b6441ca12b2f",
+ "sha256:59e975be4206cc54d0a112ef72fa3970a57c2b1bcc2c97ed41d6df0ebe518228",
+ "sha256:5a453c45178d7864c1bdd6bfe0ee3ed2883f63b9ba2c9bb967c6b586bf763f96",
+ "sha256:5bb05c28298803a74add08ba496879242ef159c75bc86a5406fac0ffc7dd021b",
+ "sha256:5e973e637112391f05581f427659c05b30b6843bc522a65be35ac7b18ce3dedd",
+ "sha256:66c80e9b366012dbee262bb1869e4fc5ba8786cda85928481528bc4a72ec2ee8",
+ "sha256:6a7620404da966f637b9ce8d4d3d543d363223f7a12452a575189c5355fc2d25",
+ "sha256:6f8bf4708a11b47517baf962f9a02196478bbb10fdb9582add4aa1459fa82380",
+ "sha256:735cd4528c55dbe4516d6d2b403331a99fc304f8feded8ae887cf97b67d589bb",
+ "sha256:7778c224efc38a40d274da4eb82a04cac27aae20012372a7db3c4bbd8926c4d4",
+ "sha256:8277d1f6282463c40761f802b742f833f9f2449fcdbb20a96579aa05c8feb614",
+ "sha256:88b6ea78b83d2796f330b0af1b70cdd3965dbdab02d8ac293260ec2c8fe340ee",
+ "sha256:8c07220408d3268e8268c9351c5c08041bc6f8c6172e59d398b71020df108541",
+ "sha256:8d0c019ee7dcf2c60e240877107cddbd95a5b1081787579bf179938392d66480",
+ "sha256:90b0186516b7a101c14764b0c25931b741fb0102f21253eff67847b4742dfc72",
+ "sha256:9837ac1650cc68d22a3cf5f15fb62c6964747d16cecc8b22431f113d6e39555d",
+ "sha256:9eac51307c6a1a38d5f86ebabc26a889fddf20cbba7a116ccb54ba1446601d5b",
+ "sha256:9f0c0d88b84259f93c3aa46398680646f2c23e43394779758d9f739c34e15295",
+ "sha256:a0ad38bc98d0b0497a0b0dbc29409bcabfcecff4511ed7063403c86de16927bc",
+ "sha256:b265cde49774752aec9ca7f5d272e3f98718164afc85521622a8a5394158a2b5",
+ "sha256:b6a107ab06206acc5359d4c5632bd9b22d448702a79b3169b0c62e0fb808bb2a",
+ "sha256:b7f920eaf46ebf41bd26f9df51c160d40f9e00b7b48471c3438cb8d027f7fb9b",
+ "sha256:c20498bd712b6e31a4402e1d027a1cd64f6a4a0066a3fe3c7344475886d07fdf",
+ "sha256:cb18d9cd7b3f35a2f5af47bb8ed6a5bdbf4f3ddee37f3daade4ab7864c292f5b",
+ "sha256:cf5bb9254e1c38aacf253d510d3d9be631bba21f3d068b17672b38b5cbf2fff5",
+ "sha256:d017775851a778ec9cc32651c4464079d06d927303c2dde9ae9830ccf6fe94e1",
+ "sha256:dc96668d9c7cc656609764275c5f8da58ef56d89bdd6810f6923d36497468ff7",
+ "sha256:e066a0586833d610c4bbddba0be5ba0e3e4f8e0bc5bb6d82103d8f8fc47bb59a",
+ "sha256:e3a013895c64352b49d0d8e107a84f99631b16dbab156ded33ebf0becf56c8b2",
+ "sha256:eaf905a580f2747c4a474040a5063cd5e0cc3d1d2d6edb65f28196186493ad4a"
],
"index": "ia",
- "version": "==0.5.4"
+ "version": "==0.6.1"
},
"pytz": {
"hashes": [
- "sha256:26c0b32e437e54a18161324a2fca3c4b9846b74a8dccddd843113109e1116b32",
- "sha256:c894d57500a4cd2d5c71114aaab77dbab5eabd9022308ce5ac9bb93a60a6f0c7"
+ "sha256:7ccfae7b4b2c067464a6733c6261673fdb8fd1be905460396b97a073e9fa683a",
+ "sha256:93007def75ae22f7cd991c84e02d434876818661f8df9ad5df9e950ff4e52cfd"
],
- "version": "==2019.2"
+ "version": "==2022.7"
},
- "pyyaml": {
+ "pytz-deprecation-shim": {
"hashes": [
- "sha256:0113bc0ec2ad727182326b61326afa3d1d8280ae1122493553fd6f4397f33df9",
- "sha256:01adf0b6c6f61bd11af6e10ca52b7d4057dd0be0343eb9283c878cf3af56aee4",
- "sha256:5124373960b0b3f4aa7df1707e63e9f109b5263eca5976c66e08b1c552d4eaf8",
- "sha256:5ca4f10adbddae56d824b2c09668e91219bb178a1eee1faa56af6f99f11bf696",
- "sha256:7907be34ffa3c5a32b60b95f4d95ea25361c951383a894fec31be7252b2b6f34",
- "sha256:7ec9b2a4ed5cad025c2278a1e6a19c011c80a3caaac804fd2d329e9cc2c287c9",
- "sha256:87ae4c829bb25b9fe99cf71fbb2140c448f534e24c998cc60f39ae4f94396a73",
- "sha256:9de9919becc9cc2ff03637872a440195ac4241c80536632fffeb6a1e25a74299",
- "sha256:a5a85b10e450c66b49f98846937e8cfca1db3127a9d5d1e31ca45c3d0bef4c5b",
- "sha256:b0997827b4f6a7c286c01c5f60384d218dca4ed7d9efa945c3e1aa623d5709ae",
- "sha256:b631ef96d3222e62861443cc89d6563ba3eeb816eeb96b2629345ab795e53681",
- "sha256:bf47c0607522fdbca6c9e817a6e81b08491de50f3766a7a0e6a5be7905961b41",
- "sha256:f81025eddd0327c7d4cfe9b62cf33190e1e736cc6e97502b3ec425f574b3e7a8"
+ "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
+ "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
],
- "version": "==5.1.2"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==0.1.0.post0"
},
- "raven": {
+ "pyyaml": {
"hashes": [
- "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
- "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
- ],
- "index": "ia",
- "version": "==6.10.0"
+ "sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97",
+ "sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76",
+ "sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2",
+ "sha256:6034f55dab5fea9e53f436aa68fa3ace2634918e8b5994d82f3621c04ff5ed2e",
+ "sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648",
+ "sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf",
+ "sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f",
+ "sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2",
+ "sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee",
+ "sha256:ad9c67312c84def58f3c04504727ca879cb0013b2517c85a9a253f0cb6380c0a",
+ "sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d",
+ "sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c",
+ "sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a"
+ ],
+ "version": "==5.3.1"
+ },
+ "rapidfuzz": {
+ "hashes": [
+ "sha256:020858dd89b60ce38811cd6e37875c4c3c8d7fcd8bc20a0ad2ed1f464b34dc4e",
+ "sha256:042644133244bfa7b20de635d500eb9f46af7097f3d90b1724f94866f17cb55e",
+ "sha256:08590905a95ccfa43f4df353dcc5d28c15d70664299c64abcad8721d89adce4f",
+ "sha256:114810491efb25464016fd554fdf1e20d390309cecef62587494fc474d4b926f",
+ "sha256:1333fb3d603d6b1040e365dca4892ba72c7e896df77a54eae27dc07db90906e3",
+ "sha256:16080c05a63d6042643ae9b6cfec1aefd3e61cef53d0abe0df3069b9d4b72077",
+ "sha256:16ffad751f43ab61001187b3fb4a9447ec2d1aedeff7c5bac86d3b95f9980cc3",
+ "sha256:1f50d1227e6e2a0e3ae1fb1c9a2e1c59577d3051af72c7cab2bcc430cb5e18da",
+ "sha256:1fbad8fb28d98980f5bff33c7842efef0315d42f0cd59082108482a7e6b61410",
+ "sha256:23524635840500ce6f4d25005c9529a97621689c85d2f727c52eed1782839a6a",
+ "sha256:24d3fea10680d085fd0a4d76e581bfb2b1074e66e78fd5964d4559e1fcd2a2d4",
+ "sha256:24eb6b843492bdc63c79ee4b2f104059b7a2201fef17f25177f585d3be03405a",
+ "sha256:25b4cedf2aa19fb7212894ce5f5219010cce611b60350e9a0a4d492122e7b351",
+ "sha256:27be9c63215d302ede7d654142a2e21f0d34ea6acba512a4ae4cfd52bbaa5b59",
+ "sha256:2c836f0f2d33d4614c3fbaf9a1eb5407c0fe23f8876f47fd15b90f78daa64c34",
+ "sha256:3a9bd02e1679c0fd2ecf69b72d0652dbe2a9844eaf04a36ddf4adfbd70010e95",
+ "sha256:3d8b081988d0a49c486e4e845a547565fee7c6e7ad8be57ff29c3d7c14c6894c",
+ "sha256:3dcffe1f3cbda0dc32133a2ae2255526561ca594f15f9644384549037b355245",
+ "sha256:3f11a7eff7bc6301cd6a5d43f309e22a815af07e1f08eeb2182892fca04c86cb",
+ "sha256:42085d4b154a8232767de8296ac39c8af5bccee6b823b0507de35f51c9cbc2d7",
+ "sha256:424f82c35dbe4f83bdc3b490d7d696a1dc6423b3d911460f5493b7ffae999fd2",
+ "sha256:43fb8cb030f888c3f076d40d428ed5eb4331f5dd6cf1796cfa39c67bf0f0fc1e",
+ "sha256:460853983ab88f873173e27cc601c5276d469388e6ad6e08c4fd57b2a86f1064",
+ "sha256:467c1505362823a5af12b10234cb1c4771ccf124c00e3fc9a43696512bd52293",
+ "sha256:46b9b8aa09998bc48dd800854e8d9b74bc534d7922c1d6e1bbf783e7fa6ac29c",
+ "sha256:53dcae85956853b787c27c1cb06f18bb450e22cf57a4ad3444cf03b8ff31724a",
+ "sha256:585206112c294e335d84de5d5f179c0f932837752d7420e3de21db7fdc476278",
+ "sha256:5ada0a14c67452358c1ee52ad14b80517a87b944897aaec3e875279371a9cb96",
+ "sha256:5e2b3d020219baa75f82a4e24b7c8adcb598c62f0e54e763c39361a9e5bad510",
+ "sha256:6120f2995f5154057454c5de99d86b4ef3b38397899b5da1265467e8980b2f60",
+ "sha256:68a89bb06d5a331511961f4d3fa7606f8e21237467ba9997cae6f67a1c2c2b9e",
+ "sha256:7496e8779905b02abc0ab4ba2a848e802ab99a6e20756ffc967a0de4900bd3da",
+ "sha256:759a3361711586a29bc753d3d1bdb862983bd9b9f37fbd7f6216c24f7c972554",
+ "sha256:75c45dcd595f8178412367e302fd022860ea025dc4a78b197b35428081ed33d5",
+ "sha256:7d005e058d86f2a968a8d28ca6f2052fab1f124a39035aa0523261d6baf21e1f",
+ "sha256:7f7930adf84301797c3f09c94b9c5a9ed90a9e8b8ed19b41d2384937e0f9f5bd",
+ "sha256:8109e0324d21993d5b2d111742bf5958f3516bf8c59f297c5d1cc25a2342eb66",
+ "sha256:81642a24798851b118f82884205fc1bd9ff70b655c04018c467824b6ecc1fabc",
+ "sha256:8450d15f7765482e86ef9be2ad1a05683cd826f59ad236ef7b9fb606464a56aa",
+ "sha256:875d51b3497439a72e2d76183e1cb5468f3f979ab2ddfc1d1f7dde3b1ecfb42f",
+ "sha256:8b477b43ced896301665183a5e0faec0f5aea2373005648da8bdcb3c4b73f280",
+ "sha256:8d3e252d4127c79b4d7c2ae47271636cbaca905c8bb46d80c7930ab906cf4b5c",
+ "sha256:916bc2e6cf492c77ad6deb7bcd088f0ce9c607aaeabc543edeb703e1fbc43e31",
+ "sha256:988f8f6abfba7ee79449f8b50687c174733b079521c3cc121d65ad2d38831846",
+ "sha256:99a84ab9ac9a823e7e93b4414f86344052a5f3e23b23aa365cda01393ad895bd",
+ "sha256:9be02162af0376d64b840f2fc8ee3366794fc149f1e06d095a6a1d42447d97c5",
+ "sha256:a5585189b3d90d81ccd62d4f18530d5ac8972021f0aaaa1ffc6af387ff1dce75",
+ "sha256:ae33a72336059213996fe4baca4e0e4860913905c2efb7c991eab33b95a98a0a",
+ "sha256:af4f7c3c904ca709493eb66ca9080b44190c38e9ecb3b48b96d38825d5672559",
+ "sha256:b20141fa6cee041917801de0bab503447196d372d4c7ee9a03721b0a8edf5337",
+ "sha256:b3210869161a864f3831635bb13d24f4708c0aa7208ef5baac1ac4d46e9b4208",
+ "sha256:b34e8c0e492949ecdd5da46a1cfc856a342e2f0389b379b1a45a3cdcd3176a6e",
+ "sha256:b52ac2626945cd21a2487aeefed794c14ee31514c8ae69b7599170418211e6f6",
+ "sha256:b5dd713a1734574c2850c566ac4286594bacbc2d60b9170b795bee4b68656625",
+ "sha256:b5f705652360d520c2de52bee11100c92f59b3e3daca308ebb150cbc58aecdad",
+ "sha256:b6389c50d8d214c9cd11a77f6d501529cb23279a9c9cafe519a3a4b503b5f72a",
+ "sha256:b6bad92de071cbffa2acd4239c1779f66851b60ffbbda0e4f4e8a2e9b17e7eef",
+ "sha256:b75dd0928ce8e216f88660ab3d5c5ffe990f4dd682fd1709dba29d5dafdde6de",
+ "sha256:c2523f8180ebd9796c18d809e9a19075a1060b1a170fde3799e83db940c1b6d5",
+ "sha256:c31022d9970177f6affc6d5dd757ed22e44a10890212032fabab903fdee3bfe7",
+ "sha256:c36fd260084bb636b9400bb92016c6bd81fd80e59ed47f2466f85eda1fc9f782",
+ "sha256:c3741cb0bf9794783028e8b0cf23dab917fa5e37a6093b94c4c2f805f8e36b9f",
+ "sha256:c3fbe449d869ea4d0909fc9d862007fb39a584fb0b73349a6aab336f0d90eaed",
+ "sha256:c66546e30addb04a16cd864f10f5821272a1bfe6462ee5605613b4f1cb6f7b48",
+ "sha256:c71d9d512b76f05fa00282227c2ae884abb60e09f08b5ca3132b7e7431ac7f0d",
+ "sha256:c8601a66fbfc0052bb7860d2eacd303fcde3c14e87fdde409eceff516d659e77",
+ "sha256:c88adbcb933f6b8612f6c593384bf824e562bb35fc8a0f55fac690ab5b3486e5",
+ "sha256:ca00fafd2756bc9649bf80f1cf72c647dce38635f0695d7ce804bc0f759aa756",
+ "sha256:ca8a23097c1f50e0fdb4de9e427537ca122a18df2eead06ed39c3a0bef6d9d3a",
+ "sha256:cda1e2f66bb4ba7261a0f4c2d052d5d909798fca557cbff68f8a79a87d66a18f",
+ "sha256:cdfc04f7647c29fb48da7a04082c34cdb16f878d3c6d098d62d5715c0ad3000c",
+ "sha256:cf62dacb3f9234f3fddd74e178e6d25c68f2067fde765f1d95f87b1381248f58",
+ "sha256:d00df2e4a81ffa56a6b1ec4d2bc29afdcb7f565e0b8cd3092fece2290c4c7a79",
+ "sha256:d248a109699ce9992304e79c1f8735c82cc4c1386cd8e27027329c0549f248a2",
+ "sha256:d63def9bbc6b35aef4d76dc740301a4185867e8870cbb8719ec9de672212fca8",
+ "sha256:d82f20c0060ffdaadaf642b88ab0aa52365b56dffae812e188e5bdb998043588",
+ "sha256:dbcf5371ea704759fcce772c66a07647751d1f5dbdec7818331c9b31ae996c77",
+ "sha256:e8914dad106dacb0775718e54bf15e528055c4e92fb2677842996f2d52da5069",
+ "sha256:ebe303cd9839af69dd1f7942acaa80b1ba90bacef2e7ded9347fbed4f1654672",
+ "sha256:ec55a81ac2b0f41b8d6fb29aad16e55417036c7563bad5568686931aa4ff08f7",
+ "sha256:effe182767d102cb65dfbbf74192237dbd22d4191928d59415aa7d7c861d8c88",
+ "sha256:f42b82f268689f429def9ecfb86fa65ceea0eaf3fed408b570fe113311bf5ce7",
+ "sha256:f6fe570e20e293eb50491ae14ddeef71a6a7e5f59d7e791393ffa99b13f1f8c2",
+ "sha256:f799d1d6c33d81e983d3682571cc7d993ae7ff772c19b3aabb767039c33f6d1e",
+ "sha256:f891b98f8bc6c9d521785816085e9657212621e93f223917fb8e32f318b2957e",
+ "sha256:fa263135b892686e11d5b84f6a1892523123a00b7e5882eff4fbdabb38667347",
+ "sha256:fa4c598ed77f74ec973247ca776341200b0f93ec3883e34c222907ce72cb92a4",
+ "sha256:fe56659ccadbee97908132135de4b875543353351e0c92e736b7c57aee298b5a",
+ "sha256:fe59a0c21a032024edb0c8e43f5dee5623fef0b65a1e3c1281836d9ce199af3b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.13.7"
},
"redis": {
"hashes": [
- "sha256:98a22fb750c9b9bb46e75e945dc3f61d0ab30d06117cbb21ff9cd1d315fedd3b",
- "sha256:c504251769031b0dd7dd5cf786050a6050197c6de0d37778c80c08cb04ae8275"
+ "sha256:7b8c87d19c45d3f1271b124858d2a5c13160c4e74d4835e28273400fa34d5228",
+ "sha256:cae3ee5d1f57d8caf534cd8764edf3163c77e073bdd74b6f54a87ffafdc5e7d9"
],
- "version": "==3.3.8"
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
+ },
+ "regex": {
+ "hashes": [
+ "sha256:052b670fafbe30966bbe5d025e90b2a491f85dfe5b2583a163b5e60a85a321ad",
+ "sha256:0653d012b3bf45f194e5e6a41df9258811ac8fc395579fa82958a8b76286bea4",
+ "sha256:0a069c8483466806ab94ea9068c34b200b8bfc66b6762f45a831c4baaa9e8cdd",
+ "sha256:0cf0da36a212978be2c2e2e2d04bdff46f850108fccc1851332bcae51c8907cc",
+ "sha256:131d4be09bea7ce2577f9623e415cab287a3c8e0624f778c1d955ec7c281bd4d",
+ "sha256:144486e029793a733e43b2e37df16a16df4ceb62102636ff3db6033994711066",
+ "sha256:1ddf14031a3882f684b8642cb74eea3af93a2be68893901b2b387c5fd92a03ec",
+ "sha256:1eba476b1b242620c266edf6325b443a2e22b633217a9835a52d8da2b5c051f9",
+ "sha256:20f61c9944f0be2dc2b75689ba409938c14876c19d02f7585af4460b6a21403e",
+ "sha256:22960019a842777a9fa5134c2364efaed5fbf9610ddc5c904bd3a400973b0eb8",
+ "sha256:22e7ebc231d28393dfdc19b185d97e14a0f178bedd78e85aad660e93b646604e",
+ "sha256:23cbb932cc53a86ebde0fb72e7e645f9a5eec1a5af7aa9ce333e46286caef783",
+ "sha256:29c04741b9ae13d1e94cf93fca257730b97ce6ea64cfe1eba11cf9ac4e85afb6",
+ "sha256:2bde29cc44fa81c0a0c8686992c3080b37c488df167a371500b2a43ce9f026d1",
+ "sha256:2cdc55ca07b4e70dda898d2ab7150ecf17c990076d3acd7a5f3b25cb23a69f1c",
+ "sha256:370f6e97d02bf2dd20d7468ce4f38e173a124e769762d00beadec3bc2f4b3bc4",
+ "sha256:395161bbdbd04a8333b9ff9763a05e9ceb4fe210e3c7690f5e68cedd3d65d8e1",
+ "sha256:44136355e2f5e06bf6b23d337a75386371ba742ffa771440b85bed367c1318d1",
+ "sha256:44a6c2f6374e0033873e9ed577a54a3602b4f609867794c1a3ebba65e4c93ee7",
+ "sha256:4919899577ba37f505aaebdf6e7dc812d55e8f097331312db7f1aab18767cce8",
+ "sha256:4b4b1fe58cd102d75ef0552cf17242705ce0759f9695334a56644ad2d83903fe",
+ "sha256:4bdd56ee719a8f751cf5a593476a441c4e56c9b64dc1f0f30902858c4ef8771d",
+ "sha256:4bf41b8b0a80708f7e0384519795e80dcb44d7199a35d52c15cc674d10b3081b",
+ "sha256:4cac3405d8dda8bc6ed499557625585544dd5cbf32072dcc72b5a176cb1271c8",
+ "sha256:4fe7fda2fe7c8890d454f2cbc91d6c01baf206fbc96d89a80241a02985118c0c",
+ "sha256:50921c140561d3db2ab9f5b11c5184846cde686bb5a9dc64cae442926e86f3af",
+ "sha256:5217c25229b6a85049416a5c1e6451e9060a1edcf988641e309dbe3ab26d3e49",
+ "sha256:5352bea8a8f84b89d45ccc503f390a6be77917932b1c98c4cdc3565137acc714",
+ "sha256:542e3e306d1669b25936b64917285cdffcd4f5c6f0247636fec037187bd93542",
+ "sha256:543883e3496c8b6d58bd036c99486c3c8387c2fc01f7a342b760c1ea3158a318",
+ "sha256:586b36ebda81e6c1a9c5a5d0bfdc236399ba6595e1397842fd4a45648c30f35e",
+ "sha256:597f899f4ed42a38df7b0e46714880fb4e19a25c2f66e5c908805466721760f5",
+ "sha256:5a260758454580f11dd8743fa98319bb046037dfab4f7828008909d0aa5292bc",
+ "sha256:5aefb84a301327ad115e9d346c8e2760009131d9d4b4c6b213648d02e2abe144",
+ "sha256:5e6a5567078b3eaed93558842346c9d678e116ab0135e22eb72db8325e90b453",
+ "sha256:5ff525698de226c0ca743bfa71fc6b378cda2ddcf0d22d7c37b1cc925c9650a5",
+ "sha256:61edbca89aa3f5ef7ecac8c23d975fe7261c12665f1d90a6b1af527bba86ce61",
+ "sha256:659175b2144d199560d99a8d13b2228b85e6019b6e09e556209dfb8c37b78a11",
+ "sha256:6a9a19bea8495bb419dc5d38c4519567781cd8d571c72efc6aa959473d10221a",
+ "sha256:6b30bddd61d2a3261f025ad0f9ee2586988c6a00c780a2fb0a92cea2aa702c54",
+ "sha256:6ffd55b5aedc6f25fd8d9f905c9376ca44fcf768673ffb9d160dd6f409bfda73",
+ "sha256:702d8fc6f25bbf412ee706bd73019da5e44a8400861dfff7ff31eb5b4a1276dc",
+ "sha256:74bcab50a13960f2a610cdcd066e25f1fd59e23b69637c92ad470784a51b1347",
+ "sha256:75f591b2055523fc02a4bbe598aa867df9e953255f0b7f7715d2a36a9c30065c",
+ "sha256:763b64853b0a8f4f9cfb41a76a4a85a9bcda7fdda5cb057016e7706fde928e66",
+ "sha256:76c598ca73ec73a2f568e2a72ba46c3b6c8690ad9a07092b18e48ceb936e9f0c",
+ "sha256:78d680ef3e4d405f36f0d6d1ea54e740366f061645930072d39bca16a10d8c93",
+ "sha256:7b280948d00bd3973c1998f92e22aa3ecb76682e3a4255f33e1020bd32adf443",
+ "sha256:7db345956ecce0c99b97b042b4ca7326feeec6b75facd8390af73b18e2650ffc",
+ "sha256:7dbdce0c534bbf52274b94768b3498abdf675a691fec5f751b6057b3030f34c1",
+ "sha256:7ef6b5942e6bfc5706301a18a62300c60db9af7f6368042227ccb7eeb22d0892",
+ "sha256:7f5a3ffc731494f1a57bd91c47dc483a1e10048131ffb52d901bfe2beb6102e8",
+ "sha256:8a45b6514861916c429e6059a55cf7db74670eaed2052a648e3e4d04f070e001",
+ "sha256:8ad241da7fac963d7573cc67a064c57c58766b62a9a20c452ca1f21050868dfa",
+ "sha256:8b0886885f7323beea6f552c28bff62cbe0983b9fbb94126531693ea6c5ebb90",
+ "sha256:8ca88da1bd78990b536c4a7765f719803eb4f8f9971cc22d6ca965c10a7f2c4c",
+ "sha256:8e0caeff18b96ea90fc0eb6e3bdb2b10ab5b01a95128dfeccb64a7238decf5f0",
+ "sha256:957403a978e10fb3ca42572a23e6f7badff39aa1ce2f4ade68ee452dc6807692",
+ "sha256:9af69f6746120998cd9c355e9c3c6aec7dff70d47247188feb4f829502be8ab4",
+ "sha256:9c94f7cc91ab16b36ba5ce476f1904c91d6c92441f01cd61a8e2729442d6fcf5",
+ "sha256:a37d51fa9a00d265cf73f3de3930fa9c41548177ba4f0faf76e61d512c774690",
+ "sha256:a3a98921da9a1bf8457aeee6a551948a83601689e5ecdd736894ea9bbec77e83",
+ "sha256:a3c1ebd4ed8e76e886507c9eddb1a891673686c813adf889b864a17fafcf6d66",
+ "sha256:a5f9505efd574d1e5b4a76ac9dd92a12acb2b309551e9aa874c13c11caefbe4f",
+ "sha256:a8ff454ef0bb061e37df03557afda9d785c905dab15584860f982e88be73015f",
+ "sha256:a9d0b68ac1743964755ae2d89772c7e6fb0118acd4d0b7464eaf3921c6b49dd4",
+ "sha256:aa62a07ac93b7cb6b7d0389d8ef57ffc321d78f60c037b19dfa78d6b17c928ee",
+ "sha256:ac741bf78b9bb432e2d314439275235f41656e189856b11fb4e774d9f7246d81",
+ "sha256:ae1e96785696b543394a4e3f15f3f225d44f3c55dafe3f206493031419fedf95",
+ "sha256:b683e5fd7f74fb66e89a1ed16076dbab3f8e9f34c18b1979ded614fe10cdc4d9",
+ "sha256:b7a8b43ee64ca8f4befa2bea4083f7c52c92864d8518244bfa6e88c751fa8fff",
+ "sha256:b8e38472739028e5f2c3a4aded0ab7eadc447f0d84f310c7a8bb697ec417229e",
+ "sha256:bfff48c7bd23c6e2aec6454aaf6edc44444b229e94743b34bdcdda2e35126cf5",
+ "sha256:c14b63c9d7bab795d17392c7c1f9aaabbffd4cf4387725a0ac69109fb3b550c6",
+ "sha256:c27cc1e4b197092e50ddbf0118c788d9977f3f8f35bfbbd3e76c1846a3443df7",
+ "sha256:c28d3309ebd6d6b2cf82969b5179bed5fefe6142c70f354ece94324fa11bf6a1",
+ "sha256:c670f4773f2f6f1957ff8a3962c7dd12e4be54d05839b216cb7fd70b5a1df394",
+ "sha256:ce6910b56b700bea7be82c54ddf2e0ed792a577dfaa4a76b9af07d550af435c6",
+ "sha256:d0213671691e341f6849bf33cd9fad21f7b1cb88b89e024f33370733fec58742",
+ "sha256:d03fe67b2325cb3f09be029fd5da8df9e6974f0cde2c2ac6a79d2634e791dd57",
+ "sha256:d0e5af9a9effb88535a472e19169e09ce750c3d442fb222254a276d77808620b",
+ "sha256:d243b36fbf3d73c25e48014961e83c19c9cc92530516ce3c43050ea6276a2ab7",
+ "sha256:d26166acf62f731f50bdd885b04b38828436d74e8e362bfcb8df221d868b5d9b",
+ "sha256:d403d781b0e06d2922435ce3b8d2376579f0c217ae491e273bab8d092727d244",
+ "sha256:d8716f82502997b3d0895d1c64c3b834181b1eaca28f3f6336a71777e437c2af",
+ "sha256:e4f781ffedd17b0b834c8731b75cce2639d5a8afe961c1e58ee7f1f20b3af185",
+ "sha256:e613a98ead2005c4ce037c7b061f2409a1a4e45099edb0ef3200ee26ed2a69a8",
+ "sha256:ef4163770525257876f10e8ece1cf25b71468316f61451ded1a6f44273eedeb5"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.10.31"
},
"requests": {
"hashes": [
- "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
- "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2.22.0"
+ "version": "==2.28.1"
},
"requests-file": {
"hashes": [
- "sha256:75c175eed739270aec3c5279ffd74e6527dada275c5c0d76b5817e9c86bb7dea",
- "sha256:8f04aa6201bacda0567e7ac7f677f1499b0fc76b22140c54bc06edf1ba92e2fa"
+ "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e",
+ "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"
],
- "version": "==1.4.3"
+ "version": "==1.5.1"
},
"robotexclusionrulesparser": {
"hashes": [
@@ -548,85 +1112,204 @@
},
"s3transfer": {
"hashes": [
- "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d",
- "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba"
+ "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
+ "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
],
- "version": "==0.2.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==0.6.0"
},
"schedule": {
"hashes": [
- "sha256:3f895a1036799a25ab9c335de917073e63cf8256920917e932777382f101f08f",
- "sha256:f9fb5181283de4db6e701d476dd01b6a3dd81c38462a54991ddbb9d26db857c9"
+ "sha256:617adce8b4bf38c360b781297d59918fbebfb2878f1671d189f4f4af5d0567a4",
+ "sha256:e6ca13585e62c810e13a08682e0a6a8ad245372e376ba2b8679294f377dfc8e4"
],
- "version": "==0.6.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.1.0"
},
"schema": {
"hashes": [
- "sha256:10b550886f5ff402e1fdef85bd7be761b0e09a35a43633311807a57a5bc4db50",
- "sha256:c9dc8f4624e287c7d1435f8fd758f6a0aabbb7eff442db9192cd46f0e2b6d959"
+ "sha256:f06717112c61895cabc4707752b88716e8420a8819d71404501e114f91043197",
+ "sha256:f3ffdeeada09ec34bf40d7d79996d9f7175db93b7a5065de0faa7f41083c1e6c"
],
- "version": "==0.7.1"
+ "version": "==0.7.5"
},
- "six": {
+ "selectolax": {
+ "hashes": [
+ "sha256:010b008aca04be6cf9727d6f206a583d79a82d397126a101f57f117113a082bb",
+ "sha256:0878aa1ab3906831b20ad9e316a77c8401030dd388f3c1c72ba51bc08d497584",
+ "sha256:087e663c0ba6d9d79294508b0a3145079e838950a0e2fc7b8b1485da3fe24254",
+ "sha256:0a8dddd34dea642429629aae21cf940668eaa1c66ab0bcf9970d72f38676697d",
+ "sha256:14c9368f9dd224f895ef1431b1961d6e9a56fb26a95b5c04900def7b8961744c",
+ "sha256:17ac0b2b4222ba2c16852c0035dcd31d9e100544e6a5138f6e01f6b1648691b5",
+ "sha256:1ba1cd707a0d0090cffb2851ec6ccfdc334ed0c2ea08ae8705a9f6c97a997f77",
+ "sha256:1d38157e2358dacf55e782d332b41391821b2ef237e34e47ff276b2184c96542",
+ "sha256:1f1ec20cc75e1866f7758e543907da222c5d8072e580cf6814f2f142036c695f",
+ "sha256:1fa1737b7031b467d8613919503c85482a59c65ac91fe60074180e625e2533c6",
+ "sha256:221051ffe8c2950e9ebe41e08103397a7b287dca05a9e8084bb9e925f2d9c556",
+ "sha256:264918c1e9e6f6657f47116e4dbd74b57c660d3e86f9cc78209f132c56c8e9e5",
+ "sha256:2d8c7ce06bdf83d3cd2a617211eec48c875826bae54c74e56aec2635daac2f31",
+ "sha256:31fb0fbc88674b3346e379664c5837070e79b2f65eab3e29b7c43e1b4fc1137c",
+ "sha256:3600747c5072725580f8dc249a40ae123840f22edab950f43b349d356f44268b",
+ "sha256:3d65d0c57cfa1b05beb5c72d3cb566f4fdaf16e5112082f300cfa6bd94836aff",
+ "sha256:3daaf7ec54565d3f15f9ce046f6a8e469d966dc4fc879af8c7f753d37994f70e",
+ "sha256:418738a2f46beea2444a1587adb4f509bdd8e7ddffac071dba097c1a3ddb8cfc",
+ "sha256:46776ca482a76b3f522e4d8f90474716e4da51dc2823f3ecc6a2ff38ef0663b7",
+ "sha256:46bacca9e9f077ff2c5a973c05b8862425f077c58f2dca8059b992ceaca6b6de",
+ "sha256:4c5c68f0139d0928298ef5e95137996e0efb6f8db364b1470221e8710834a0ab",
+ "sha256:51c33d33e4e4eec0d9c1b6accdda5c93f4e3a00b28e99fc4ebb2b95d1d4ef885",
+ "sha256:585a75f4aff85b48d0fc8f3e9afbd1e2c05902a332982d04bab93e8e1db2e4a4",
+ "sha256:5acbe02c26b43428c2f49e8f09a81bd47be7ea969c6798cde1a23c2b33d25c79",
+ "sha256:6111ac9e5ca02b13d8e3057c1e20d6608435c64a11f92460a59951a7209c2cf3",
+ "sha256:67c32c29bc9011ed1b6fd67a961073e69d67bf60bf09f3db54d6240c034719f4",
+ "sha256:68c42af2cabecf04528dff2d0bbebbecfbafc394a5192b6a5b3e1dcd19eeb766",
+ "sha256:709b1680a16f210c43e4f3240dfc15e3312ccd43c9ea20c8e20c81470214cfc6",
+ "sha256:762e91a0ac0caa2d8731568e5b2ad0cec6fc06465a9dd89280118ced4b7e0849",
+ "sha256:7d47e489a8b0181992a3384987c854bd88211685e1c32dcdcb8746ec98dbcf7e",
+ "sha256:7ebe824763782f0e6ad2accd57d0cef3a61922b72be99ccafebe0154e9b8aef6",
+ "sha256:7f1a35be9413bcd56f225b1509740ea8999a6f7558e0f0a50a4ca80b91bf11be",
+ "sha256:81c7847ff0f3561559bd98015aa3fe0a2dfb26966156f7704f7f65339d48e81c",
+ "sha256:9246bf586afaacfdc0e6fb17806ee0d3e1736d3d13a87c8e96214596d50576b7",
+ "sha256:9baff22ae7015e8f2697d5db0804ee379d53fa6e54f1dc7e9f61ee8ccb1bdb2e",
+ "sha256:a4634d7c7e9d2eb65d0fc7fe0d88641eb413cb7250fbfc66b3b4d88d49e4c724",
+ "sha256:a7fa03253260c3351f61cef36865b27ad4585516e9ac4a77244d237bfaf37f13",
+ "sha256:abac4b7afe430dd135f148d4001b593b09c8f64fccd63b15fbb03b77735e3405",
+ "sha256:ad0cfc7f66a2863d199af819c79bfa160bcc830e0f83fd5391cdd80e545af758",
+ "sha256:adabfb5635d00da49bddef3844dc65ca3da81acd889ea7be2a74ef9456558f36",
+ "sha256:ae58e7cc282a768a68abbfa39eff895788a39658c5a235524c21b09d182b3d3a",
+ "sha256:b348074bc3a0e16e9af1a2f57e0da18f5def97e415c6435dadc68aead7ccf060",
+ "sha256:b48e4c8df2c226552ac18636c2ebe9d100ff3daa8742616687bd2cbf74a81e2f",
+ "sha256:c23d9f82aea887347151538a58b15a8dbee4261e4114705c0974dee81eb796e0",
+ "sha256:c2b589be0dd45d62ec43a6446f09919b5be809c708d8ff6a7cb86acd9150091b",
+ "sha256:d13904fc037bcebc6d79e83c0a19e64cc9d4771cd7f27b325c63d1071ec0d0f0",
+ "sha256:d3506e831b972c1eb22538b25e7c991289b72b2e028bd27b633dfbd21c1a511a",
+ "sha256:d809fbf258c28190160b3fe5d34adddb1da44ed7a2f800b7125e0fac6e940016",
+ "sha256:da688ca957d68b8072dc9658506c07326f6332ff3fe03214fec375a4ccc67f8a",
+ "sha256:e001a40b25e478f8390c3898c5852cf9a226668ba02fdc4d8e3a4788ce64207a",
+ "sha256:e805b106edac716047afc6e9e49953242207909bfbb70bf47c53f231e2d27d74",
+ "sha256:eb86cacac6ed203c386afe6704732fb05d831006c65869f15f41d15e9e72973b",
+ "sha256:f5cef3310fc41f71e8fc19d05534d100f6c02789d46041777b0bbd70961a94ec",
+ "sha256:f76b0ad63b55e45d3c02e50ca8b8ef64a500aed9a5f50818173b66949470f8e4",
+ "sha256:fad7fb68e929082e6474e1392dd433d465b06b59e26158ef67813c0c8e5b7f66",
+ "sha256:fb3b3425ee21f5098531ce80dc48d99a555b8b2300deb0ddf84b6bc503f0a848",
+ "sha256:fc53731aa81617694667d4c56d21a9e26df840a219f4b62588af80c6781ba613"
+ ],
+ "index": "ia",
+ "version": "==0.3.11"
+ },
+ "sentry-sdk": {
+ "extras": [],
"hashes": [
- "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
- "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ "sha256:5bbe4b72de22f9ac1e67f2a4e6efe8fbd595bb59b7b223443f50fe5802a5551c",
+ "sha256:9f0b960694e2d8bb04db4ba6ac2a645040caef4e762c65937998ff06064f10d6"
],
- "version": "==1.12.0"
+ "index": "ia",
+ "version": "==1.12.1"
},
- "soupsieve": {
+ "six": {
"hashes": [
- "sha256:8662843366b8d8779dec4e2f921bebec9afd856a5ff2e82cd419acc5054a1a92",
- "sha256:a5a6166b4767725fd52ae55fee8c8b6137d9a51e9f1edea461a062a759160118"
+ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+ "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
- "version": "==1.9.3"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==1.16.0"
},
- "sqlalchemy": {
+ "soupsieve": {
"hashes": [
- "sha256:2f8ff566a4d3a92246d367f2e9cd6ed3edeef670dcd6dda6dfdc9efed88bcd80"
+ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
+ "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
],
- "version": "==1.3.8"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.3.2.post1"
},
"surt": {
"hashes": [
- "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720",
- "sha256:5691e63b189af04aa1fb178ecce5fc7d872cc582e2b6861d4500f6d41915306a"
+ "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720"
],
"version": "==0.3.1"
},
- "tabulate": {
+ "tld": {
"hashes": [
- "sha256:d0097023658d4dea848d6ae73af84532d1e86617ac0925d1adf1dd903985dac3"
+ "sha256:266106ad9035f54cd5cce5f823911a51f697e7c58cb45bfbd6c53b4c2976ece2",
+ "sha256:69fed19d26bb3f715366fb4af66fdeace896c55c052b00e8aaba3a7b63f3e7f0",
+ "sha256:826bbe61dccc8d63144b51caef83e1373fbaac6f9ada46fca7846021f5d36fef",
+ "sha256:843844e4256c943983d86366b5af3ac9cd1c9a0b6465f04d9f70e3b4c1a7989f",
+ "sha256:a92ac6b84917e7d9e934434b8d37e9be534598f138fbb86b3c0d5426f2621890",
+ "sha256:b6650f2d5392a49760064bc55d73ce3397a378ef24ded96efb516c6b8ec68c26",
+ "sha256:ef5b162d6fa295822dacd4fe4df1b62d8df2550795a97399a8905821b58d3702"
],
- "version": "==0.8.5"
+ "markers": "python_version >= '2.7' and python_version < '4'",
+ "version": "==0.12.6"
},
"tldextract": {
"hashes": [
- "sha256:2c1c5d9d454f79734b4f3da0d603856dd9f820753410a3e9abf0a0c9fde33e97",
- "sha256:b72bef6013de67c7fa181250bc2c2e089a994d259c09ca95a9771f2f97e29ed1"
+ "sha256:47aa4d8f1a4da79a44529c9a2ddc518663b25d371b805194ec5ce2a5f615ccd2",
+ "sha256:78aef13ac1459d519b457a03f1f74c1bf1c2808122a6bcc0e6840f81ba55ad73"
],
- "version": "==2.2.1"
+ "markers": "python_version >= '3.7'",
+ "version": "==3.4.0"
},
- "total-ordering": {
+ "tqdm": {
"hashes": [
- "sha256:a14a2a138a52befaa02b3fd53eb3366f66da69020be299af3cf0b54c9441aacc"
+ "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4",
+ "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"
],
- "version": "==0.1.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==4.64.1"
+ },
+ "trafilatura": {
+ "hashes": [
+ "sha256:a66189e4b9d591dce648f0cc79fb52a486e679708090189bc4fcd88068f095ef",
+ "sha256:c2bc0cbac6248363d938666cbedbb067ad8aefe31667c88038135b93efd475c3"
+ ],
+ "index": "ia",
+ "version": "==1.3.0"
},
"twitter": {
"hashes": [
- "sha256:52545fd3b70d3d3807d3ce62d1a256727856d784d1630d64dedcc643aaf0b908",
- "sha256:acdc85e5beea752967bb64c63bde8b915c49a31a01db1b2fecccf9f2c1d5c44d"
+ "sha256:1d9a3e45f2c440f308a7116d3672b0d1981aba8ac41cb7f3ed270ed50693f0e0",
+ "sha256:80ddd69ae2eeb88313feedeea31bf119fd6e79541ee5b37abb9c43d233194e10"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==1.19.6"
+ },
+ "typing-extensions": {
+ "hashes": [
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
+ },
+ "tzdata": {
+ "hashes": [
+ "sha256:2b88858b0e3120792a3c0635c23daf36a7d7eeeca657c323da299d2094402a0d",
+ "sha256:fe5f866eddd8b96e9fcba978f8e503c909b19ea7efda11e52e39494bad3a7bfa"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.7"
+ },
+ "tzlocal": {
+ "hashes": [
+ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
+ "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
- "version": "==1.18.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==4.2"
+ },
+ "urlcanon": {
+ "hashes": [
+ "sha256:30f5bf0e2e4a0feb6dd9ee139a4180a5d493117e8a1448569da3d73c18b92b62"
+ ],
+ "index": "ia",
+ "version": "==0.3.1"
},
"urllib3": {
"hashes": [
- "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
- "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
],
- "markers": "python_version >= '3.4'",
- "version": "==1.22"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
},
"warctools": {
"hashes": [
@@ -635,131 +1318,271 @@
"version": "==4.10.0"
},
"wayback": {
- "hashes": [
- "sha256:b7e709530b98b42811e24230d025513b652c3a68413fe80e3f4b9f66cf7e84d2"
+ "extras": [
+ "brotli"
],
- "index": "ia",
- "version": "==0.4.7.5"
- },
- "wayback-esp": {
"hashes": [
- "sha256:546a0cb99a8f0c94ad4fbfd7ceb9c24503c6ef14af13456addc68d44289ab690"
+ "sha256:3a3f149508d68ec53f5cdf434a45e5bb906beef731327d7bd2ef6b751c98281b"
],
- "version": "==0.2.4"
+ "index": "ia",
+ "version": "==0.8.6.1"
},
"wayback-search-js": {
"hashes": [
- "sha256:400bb43d44ec2399001508763be7c8409aa4cc59f9f97e0ba9bf195d4bba1092"
+ "sha256:a474ba8da58f9cc27b1dce7f87a8cc7d119715ab4bab750dcc1d90f002074161"
],
- "version": "==2.9.0"
+ "version": "==3.1.21"
},
"wbex-client": {
"hashes": [
- "sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c"
+ "sha256:8c4028d744dda05cca932b411a826f9478a65cbc018784bff9528e973c7f9c36"
],
- "version": "==0.1.5"
+ "version": "==0.1.6.1"
},
"wcwidth": {
"hashes": [
- "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
- "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784",
+ "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"
],
- "version": "==0.1.7"
+ "version": "==0.2.5"
},
"werkzeug": {
"hashes": [
- "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7",
- "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4"
+ "sha256:1421ebfc7648a39a5c58c601b154165d05cf47a3cd0ccb70857cbdacf6c8f2b8",
+ "sha256:b863f8ff057c522164b6067c9e28b041161b4be5ba4d0daceeaa50a163822d3c"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2.0.3"
+ },
+ "zstandard": {
+ "hashes": [
+ "sha256:04c298d381a3b6274b0a8001f0da0ec7819d052ad9c3b0863fe8c7f154061f76",
+ "sha256:0fde1c56ec118940974e726c2a27e5b54e71e16c6f81d0b4722112b91d2d9009",
+ "sha256:126aa8433773efad0871f624339c7984a9c43913952f77d5abeee7f95a0c0860",
+ "sha256:1a4fb8b4ac6772e4d656103ccaf2e43e45bd16b5da324b963d58ef360d09eb73",
+ "sha256:2e4812720582d0803e84aefa2ac48ce1e1e6e200ca3ce1ae2be6d410c1d637ae",
+ "sha256:2f01b27d0b453f07cbcff01405cdd007e71f5d6410eb01303a16ba19213e58e4",
+ "sha256:31d12fcd942dd8dbf52ca5f6b1bbe287f44e5d551a081a983ff3ea2082867863",
+ "sha256:3c927b6aa682c6d96225e1c797f4a5d0b9f777b327dea912b23471aaf5385376",
+ "sha256:3d5bb598963ac1f1f5b72dd006adb46ca6203e4fb7269a5b6e1f99e85b07ad38",
+ "sha256:401508efe02341ae681752a87e8ac9ef76df85ef1a238a7a21786a489d2c983d",
+ "sha256:4514b19abe6dbd36d6c5d75c54faca24b1ceb3999193c5b1f4b685abeabde3d0",
+ "sha256:47dfa52bed3097c705451bafd56dac26535545a987b6759fa39da1602349d7ba",
+ "sha256:4fa496d2d674c6e9cffc561639d17009d29adee84a27cf1e12d3c9be14aa8feb",
+ "sha256:55a513ec67e85abd8b8b83af8813368036f03e2d29a50fc94033504918273980",
+ "sha256:55b3187e0bed004533149882ef8c24e954321f3be81f8a9ceffe35099b82a0d0",
+ "sha256:593f96718ad906e24d6534187fdade28b611f8ed06e27ba972ba48aecec45fc6",
+ "sha256:5e21032efe673b887464667d09406bab6e16d96b09ad87e80859e3a20b6745b6",
+ "sha256:60a86b7b2b1c300779167cf595e019e61afcc0e20c4838692983a921db9006ac",
+ "sha256:619f9bf37cdb4c3dc9d4120d2a1003f5db9446f3618a323219f408f6a9df6725",
+ "sha256:660b91eca10ee1b44c47843894abe3e6cfd80e50c90dee3123befbf7ca486bd3",
+ "sha256:67710d220af405f5ce22712fa741d85e8b3ada7a457ea419b038469ba379837c",
+ "sha256:6caed86cd47ae93915d9031dc04be5283c275e1a2af2ceff33932071f3eeff4d",
+ "sha256:6d2182e648e79213b3881998b30225b3f4b1f3e681f1c1eaf4cacf19bde1040d",
+ "sha256:72758c9f785831d9d744af282d54c3e0f9db34f7eae521c33798695464993da2",
+ "sha256:74c2637d12eaacb503b0b06efdf55199a11b1d7c580bd3dd9dfe84cac97ef2f6",
+ "sha256:755020d5aeb1b10bffd93d119e7709a2a7475b6ad79c8d5226cea3f76d152ce0",
+ "sha256:7ccc4727300f223184520a6064c161a90b5d0283accd72d1455bcd85ec44dd0d",
+ "sha256:81ab21d03e3b0351847a86a0b298b297fde1e152752614138021d6d16a476ea6",
+ "sha256:8371217dff635cfc0220db2720fc3ce728cd47e72bb7572cca035332823dbdfc",
+ "sha256:876567136b0359f6581ecd892bdb4ca03a0eead0265db73206c78cff03bcdb0f",
+ "sha256:879411d04068bd489db57dcf6b82ffad3c5fb2a1fdd30817c566d8b7bedee442",
+ "sha256:898500957ae5e7f31b7271ace4e6f3625b38c0ac84e8cedde8de3a77a7fdae5e",
+ "sha256:8c9ca56345b0c5574db47560603de9d05f63cce5dfeb3a456eb60f3fec737ff2",
+ "sha256:8ec2c146e10b59c376b6bc0369929647fcd95404a503a7aa0990f21c16462248",
+ "sha256:8f7c68de4f362c1b2f426395fe4e05028c56d0782b2ec3ae18a5416eaf775576",
+ "sha256:909bdd4e19ea437eb9b45d6695d722f6f0fd9d8f493e837d70f92062b9f39faf",
+ "sha256:9d97c713433087ba5cee61a3e8edb54029753d45a4288ad61a176fa4718033ce",
+ "sha256:a65e0119ad39e855427520f7829618f78eb2824aa05e63ff19b466080cd99210",
+ "sha256:aa9087571729c968cd853d54b3f6e9d0ec61e45cd2c31e0eb8a0d4bdbbe6da2f",
+ "sha256:aef0889417eda2db000d791f9739f5cecb9ccdd45c98f82c6be531bdc67ff0f2",
+ "sha256:b253d0c53c8ee12c3e53d181fb9ef6ce2cd9c41cbca1c56a535e4fc8ec41e241",
+ "sha256:b80f6f6478f9d4ca26daee6c61584499493bf97950cfaa1a02b16bb5c2c17e70",
+ "sha256:be6329b5ba18ec5d32dc26181e0148e423347ed936dda48bf49fb243895d1566",
+ "sha256:c7560f622e3849cc8f3e999791a915addd08fafe80b47fcf3ffbda5b5151047c",
+ "sha256:d1a7a716bb04b1c3c4a707e38e2dee46ac544fff931e66d7ae944f3019fc55b8",
+ "sha256:d63b04e16df8ea21dfcedbf5a60e11cbba9d835d44cb3cbff233cfd037a916d5",
+ "sha256:d777d239036815e9b3a093fa9208ad314c040c26d7246617e70e23025b60083a",
+ "sha256:e892d3177380ec080550b56a7ffeab680af25575d291766bdd875147ba246a91",
+ "sha256:e9c90a44470f2999779057aeaf33461cbd8bb59d8f15e983150d10bb260e16e0",
+ "sha256:f097dda5d4f9b9b01b3c9fa2069f9c02929365f48f341feddf3d6b32510a2f93",
+ "sha256:f4ebfe03cbae821ef994b2e58e4df6a087470cc522aca502614e82a143365d45"
],
- "version": "==0.16.0"
+ "index": "ia",
+ "version": "==0.19.0"
}
},
"develop": {
"astroid": {
"hashes": [
- "sha256:9b3f17b0550f82e28a6776a4e5222441f48e523b0773df4bc505bb6b7c2093b7",
- "sha256:c7e2e5773d87ccc00d01c273e439386f4d6d63cce61317a79ccce5880162f9fb"
+ "sha256:10e0ad5f7b79c435179d0d0f0df69998c4eef4597534aae44910db060baeb907",
+ "sha256:1493fe8bd3dfd73dc35bd53c9d5b6e49ead98497c47b2307662556a5692d29d7"
],
- "version": "==2.3.0"
+ "markers": "python_full_version >= '3.7.2'",
+ "version": "==2.12.13"
},
- "atomicwrites": {
+ "asttokens": {
"hashes": [
- "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
- "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+ "sha256:4622110b2a6f30b77e1473affaa97e711bc2f07d3f10848420ff1898edbe94f3",
+ "sha256:6b0ac9e93fb0335014d382b8fa9b3afa7df546984258005da0b9e7095b3deb1c"
],
- "version": "==1.3.0"
+ "version": "==2.2.1"
},
"attrs": {
"hashes": [
- "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
- "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
+ "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836",
+ "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"
],
- "version": "==19.1.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==22.2.0"
},
"backcall": {
"hashes": [
- "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
- "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
+ "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e",
+ "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"
],
- "version": "==0.1.0"
+ "version": "==0.2.0"
+ },
+ "black": {
+ "hashes": [
+ "sha256:101c69b23df9b44247bd88e1d7e90154336ac4992502d4197bdac35dd7ee3320",
+ "sha256:159a46a4947f73387b4d83e87ea006dbb2337eab6c879620a3ba52699b1f4351",
+ "sha256:1f58cbe16dfe8c12b7434e50ff889fa479072096d79f0a7f25e4ab8e94cd8350",
+ "sha256:229351e5a18ca30f447bf724d007f890f97e13af070bb6ad4c0a441cd7596a2f",
+ "sha256:436cc9167dd28040ad90d3b404aec22cedf24a6e4d7de221bec2730ec0c97bcf",
+ "sha256:559c7a1ba9a006226f09e4916060982fd27334ae1998e7a38b3f33a37f7a2148",
+ "sha256:7412e75863aa5c5411886804678b7d083c7c28421210180d67dfd8cf1221e1f4",
+ "sha256:77d86c9f3db9b1bf6761244bc0b3572a546f5fe37917a044e02f3166d5aafa7d",
+ "sha256:82d9fe8fee3401e02e79767016b4907820a7dc28d70d137eb397b92ef3cc5bfc",
+ "sha256:9eedd20838bd5d75b80c9f5487dbcb06836a43833a37846cf1d8c1cc01cef59d",
+ "sha256:c116eed0efb9ff870ded8b62fe9f28dd61ef6e9ddd28d83d7d264a38417dcee2",
+ "sha256:d30b212bffeb1e252b31dd269dfae69dd17e06d92b87ad26e23890f3efea366f"
+ ],
+ "index": "ia",
+ "version": "==22.12.0"
},
"certifi": {
"hashes": [
- "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50",
- "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef"
+ "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
+ "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
],
- "version": "==2019.9.11"
+ "markers": "python_version >= '3.6'",
+ "version": "==2022.12.7"
},
- "chardet": {
+ "charset-normalizer": {
"hashes": [
- "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
- "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
+ "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
],
- "version": "==3.0.4"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.1.1"
},
- "coverage": {
+ "click": {
"hashes": [
- "sha256:08907593569fe59baca0bf152c43f3863201efb6113ecb38ce7e97ce339805a6",
- "sha256:0be0f1ed45fc0c185cfd4ecc19a1d6532d72f86a2bac9de7e24541febad72650",
- "sha256:141f08ed3c4b1847015e2cd62ec06d35e67a3ac185c26f7635f4406b90afa9c5",
- "sha256:19e4df788a0581238e9390c85a7a09af39c7b539b29f25c89209e6c3e371270d",
- "sha256:23cc09ed395b03424d1ae30dcc292615c1372bfba7141eb85e11e50efaa6b351",
- "sha256:245388cda02af78276b479f299bbf3783ef0a6a6273037d7c60dc73b8d8d7755",
- "sha256:331cb5115673a20fb131dadd22f5bcaf7677ef758741312bee4937d71a14b2ef",
- "sha256:386e2e4090f0bc5df274e720105c342263423e77ee8826002dcffe0c9533dbca",
- "sha256:3a794ce50daee01c74a494919d5ebdc23d58873747fa0e288318728533a3e1ca",
- "sha256:60851187677b24c6085248f0a0b9b98d49cba7ecc7ec60ba6b9d2e5574ac1ee9",
- "sha256:63a9a5fc43b58735f65ed63d2cf43508f462dc49857da70b8980ad78d41d52fc",
- "sha256:6b62544bb68106e3f00b21c8930e83e584fdca005d4fffd29bb39fb3ffa03cb5",
- "sha256:6ba744056423ef8d450cf627289166da65903885272055fb4b5e113137cfa14f",
- "sha256:7494b0b0274c5072bddbfd5b4a6c6f18fbbe1ab1d22a41e99cd2d00c8f96ecfe",
- "sha256:826f32b9547c8091679ff292a82aca9c7b9650f9fda3e2ca6bf2ac905b7ce888",
- "sha256:93715dffbcd0678057f947f496484e906bf9509f5c1c38fc9ba3922893cda5f5",
- "sha256:9a334d6c83dfeadae576b4d633a71620d40d1c379129d587faa42ee3e2a85cce",
- "sha256:af7ed8a8aa6957aac47b4268631fa1df984643f07ef00acd374e456364b373f5",
- "sha256:bf0a7aed7f5521c7ca67febd57db473af4762b9622254291fbcbb8cd0ba5e33e",
- "sha256:bf1ef9eb901113a9805287e090452c05547578eaab1b62e4ad456fcc049a9b7e",
- "sha256:c0afd27bc0e307a1ffc04ca5ec010a290e49e3afbe841c5cafc5c5a80ecd81c9",
- "sha256:dd579709a87092c6dbee09d1b7cfa81831040705ffa12a1b248935274aee0437",
- "sha256:df6712284b2e44a065097846488f66840445eb987eb81b3cc6e4149e7b6982e1",
- "sha256:e07d9f1a23e9e93ab5c62902833bf3e4b1f65502927379148b6622686223125c",
- "sha256:e2ede7c1d45e65e209d6093b762e98e8318ddeff95317d07a27a2140b80cfd24",
- "sha256:e4ef9c164eb55123c62411f5936b5c2e521b12356037b6e1c2617cef45523d47",
- "sha256:eca2b7343524e7ba246cab8ff00cab47a2d6d54ada3b02772e908a45675722e2",
- "sha256:eee64c616adeff7db37cc37da4180a3a5b6177f5c46b187894e633f088fb5b28",
- "sha256:ef824cad1f980d27f26166f86856efe11eff9912c4fed97d3804820d43fa550c",
- "sha256:efc89291bd5a08855829a3c522df16d856455297cf35ae827a37edac45f466a7",
- "sha256:fa964bae817babece5aa2e8c1af841bebb6d0b9add8e637548809d040443fee0",
- "sha256:ff37757e068ae606659c28c3bd0d923f9d29a85de79bf25b2b34b148473b5025"
- ],
- "version": "==4.5.4"
+ "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e",
+ "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==8.1.3"
+ },
+ "coverage": {
+ "extras": [
+ "toml"
+ ],
+ "hashes": [
+ "sha256:07bcfb1d8ac94af886b54e18a88b393f6a73d5959bb31e46644a02453c36e475",
+ "sha256:09f6b5a8415b6b3e136d5fec62b552972187265cb705097bf030eb9d4ffb9b60",
+ "sha256:0a79137fc99815fff6a852c233628e735ec15903cfd16da0f229d9c4d45926ab",
+ "sha256:0b4b3a4d9915b2be879aff6299c0a6129f3d08a775d5a061f503cf79571f73e4",
+ "sha256:1285648428a6101b5f41a18991c84f1c3959cee359e51b8375c5882fc364a13f",
+ "sha256:12a5aa77783d49e05439fbe6e6b427484f8a0f9f456b46a51d8aac022cfd024d",
+ "sha256:19ec666533f0f70a0993f88b8273057b96c07b9d26457b41863ccd021a043b9a",
+ "sha256:1e414dc32ee5c3f36544ea466b6f52f28a7af788653744b8570d0bf12ff34bc0",
+ "sha256:2c44fcfb3781b41409d0f060a4ed748537557de9362a8a9282182fafb7a76ab4",
+ "sha256:397b4a923cc7566bbc7ae2dfd0ba5a039b61d19c740f1373791f2ebd11caea59",
+ "sha256:3cfc595d2af13856505631be072835c59f1acf30028d1c860b435c5fc9c15b69",
+ "sha256:3dd4ee135e08037f458425b8842d24a95a0961831a33f89685ff86b77d378f89",
+ "sha256:486ee81fa694b4b796fc5617e376326a088f7b9729c74d9defa211813f3861e4",
+ "sha256:4f943a3b2bc520102dd3e0bb465e1286e12c9a54f58accd71b9e65324d9c7c01",
+ "sha256:63d56165a7c76265468d7e0c5548215a5ba515fc2cba5232d17df97bffa10f6c",
+ "sha256:66b18c3cf8bbab0cce0d7b9e4262dc830e93588986865a8c78ab2ae324b3ed56",
+ "sha256:691571f31ace1837838b7e421d3a09a8c00b4aac32efacb4fc9bd0a5c647d25a",
+ "sha256:6c5ad996c6fa4d8ed669cfa1e8551348729d008a2caf81489ab9ea67cfbc7498",
+ "sha256:6d55d840e1b8c0002fce66443e124e8581f30f9ead2e54fbf6709fb593181f2c",
+ "sha256:72d1507f152abacea81f65fee38e4ef3ac3c02ff8bc16f21d935fd3a8a4ad910",
+ "sha256:74f70cd92669394eaf8d7756d1b195c8032cf7bbbdfce3bc489d4e15b3b8cf73",
+ "sha256:830525361249dc4cd013652b0efad645a385707a5ae49350c894b67d23fbb07c",
+ "sha256:854f22fa361d1ff914c7efa347398374cc7d567bdafa48ac3aa22334650dfba2",
+ "sha256:89caf4425fe88889e2973a8e9a3f6f5f9bbe5dd411d7d521e86428c08a873a4a",
+ "sha256:9158f8fb06747ac17bd237930c4372336edc85b6e13bdc778e60f9d685c3ca37",
+ "sha256:92651580bd46519067e36493acb394ea0607b55b45bd81dd4e26379ed1871f55",
+ "sha256:978258fec36c154b5e250d356c59af7d4c3ba02bef4b99cda90b6029441d797d",
+ "sha256:9823e4789ab70f3ec88724bba1a203f2856331986cd893dedbe3e23a6cfc1e4e",
+ "sha256:9b373c9345c584bb4b5f5b8840df7f4ab48c4cbb7934b58d52c57020d911b856",
+ "sha256:a4a574a19eeb67575a5328a5760bbbb737faa685616586a9f9da4281f940109c",
+ "sha256:aec2d1515d9d39ff270059fd3afbb3b44e6ec5758af73caf18991807138c7118",
+ "sha256:b3695c4f4750bca943b3e1f74ad4be8d29e4aeab927d50772c41359107bd5d5c",
+ "sha256:b3763e7fcade2ff6c8e62340af9277f54336920489ceb6a8cd6cc96da52fcc62",
+ "sha256:b66bb21a23680dee0be66557dc6b02a3152ddb55edf9f6723fa4a93368f7158d",
+ "sha256:b6f22bb64cc39bcb883e5910f99a27b200fdc14cdd79df8696fa96b0005c9444",
+ "sha256:b77015d1cb8fe941be1222a5a8b4e3fbca88180cfa7e2d4a4e58aeabadef0ab7",
+ "sha256:b9ea158775c7c2d3e54530a92da79496fb3fb577c876eec761c23e028f1e216c",
+ "sha256:c20cfebcc149a4c212f6491a5f9ff56f41829cd4f607b5be71bb2d530ef243b1",
+ "sha256:cfded268092a84605f1cc19e5c737f9ce630a8900a3589e9289622db161967e9",
+ "sha256:d1991f1dd95eba69d2cd7708ff6c2bbd2426160ffc73c2b81f617a053ebcb1a8",
+ "sha256:d3022c3007d3267a880b5adcf18c2a9bf1fc64469b394a804886b401959b8742",
+ "sha256:d6814854c02cbcd9c873c0f3286a02e3ac1250625cca822ca6bc1018c5b19f1c",
+ "sha256:d87717959d4d0ee9db08a0f1d80d21eb585aafe30f9b0a54ecf779a69cb015f6",
+ "sha256:e00c14720b8b3b6c23b487e70bd406abafc976ddc50490f645166f111c419c39",
+ "sha256:e60bef2e2416f15fdc05772bf87db06c6a6f9870d1db08fdd019fbec98ae24a9",
+ "sha256:e78e9dcbf4f3853d3ae18a8f9272111242531535ec9e1009fa8ec4a2b74557dc",
+ "sha256:f66460f17c9319ea4f91c165d46840314f0a7c004720b20be58594d162a441d8",
+ "sha256:fa6a5a224b7f4cfb226f4fc55a57e8537fcc096f42219128c2c74c0e7d0953e1",
+ "sha256:fb992c47cb1e5bd6a01e97182400bcc2ba2077080a17fcd7be23aaa6e572e390",
+ "sha256:fd1b9c5adc066db699ccf7fa839189a649afcdd9e02cb5dc9d24e67e7922737d",
+ "sha256:fd556ff16a57a070ce4f31c635953cc44e25244f91a0378c6e9bdfd40fdb249f"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==7.0.1"
},
"decorator": {
"hashes": [
- "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
- "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
+ "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330",
+ "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"
],
- "version": "==4.4.0"
+ "markers": "python_version >= '3.5'",
+ "version": "==5.1.1"
+ },
+ "dill": {
+ "hashes": [
+ "sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0",
+ "sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373"
+ ],
+ "markers": "python_version < '3.11'",
+ "version": "==0.3.6"
+ },
+ "executing": {
+ "hashes": [
+ "sha256:0314a69e37426e3608aada02473b4161d4caf5a4b244d1d0c48072b8fee7bacc",
+ "sha256:19da64c18d2d851112f09c287f8d3dbbdf725ab0e569077efb6cdcbd3497c107"
+ ],
+ "version": "==1.2.0"
+ },
+ "flake8": {
+ "hashes": [
+ "sha256:3833794e27ff64ea4e9cf5d410082a8b97ff1a06c16aa3d2027339cd0f1195c7",
+ "sha256:c61007e76655af75e6785a931f452915b371dc48f56efd765247c8fe68f2b181"
+ ],
+ "index": "ia",
+ "version": "==6.0.0"
+ },
+ "flake8-annotations": {
+ "hashes": [
+ "sha256:11f09efb99ae63c8f9d6b492b75fe147fbc323179fddfe00b2e56eefeca42f57",
+ "sha256:a4385158a7a9fc8af1d8820a2f4c8d03387997006a83f5f8bfe5bc6085bdf88a"
+ ],
+ "index": "ia",
+ "version": "==2.9.1"
},
"idna": {
"hashes": [
@@ -768,109 +1591,152 @@
],
"version": "==2.6"
},
- "importlib-metadata": {
+ "iniconfig": {
"hashes": [
- "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26",
- "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af"
+ "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
+ "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
],
- "markers": "python_version < '3.8'",
- "version": "==0.23"
+ "version": "==1.1.1"
},
"ipython": {
"hashes": [
- "sha256:c4ab005921641e40a68e405e286e7a1fcc464497e14d81b6914b4fd95e5dee9b",
- "sha256:dd76831f065f17bddd7eaa5c781f5ea32de5ef217592cf019e34043b56895aa1"
+ "sha256:352042ddcb019f7c04e48171b4dd78e4c4bb67bf97030d170e154aac42b656d9",
+ "sha256:882899fe78d5417a0aa07f995db298fa28b58faeba2112d2e3a4c95fe14bb738"
],
"index": "ia",
- "version": "==7.8.0"
- },
- "ipython-genutils": {
- "hashes": [
- "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
- "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
- ],
- "version": "==0.2.0"
+ "version": "==8.7.0"
},
"isort": {
"hashes": [
- "sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1",
- "sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd"
+ "sha256:6db30c5ded9815d813932c04c2f85a360bcdd35fed496f4d8f35495ef0a261b6",
+ "sha256:c033fd0edb91000a7f09527fe5c75321878f98322a77ddcc81adbd83724afb7b"
],
- "version": "==4.3.21"
+ "index": "ia",
+ "version": "==5.11.4"
},
"jedi": {
"hashes": [
- "sha256:786b6c3d80e2f06fd77162a07fed81b8baa22dde5d62896a790a331d6ac21a27",
- "sha256:ba859c74fa3c966a22f2aeebe1b74ee27e2a462f56d3f5f7ca4a59af61bfe42e"
+ "sha256:203c1fd9d969ab8f2119ec0a3342e0b49910045abe6af0a3ae83a5764d54639e",
+ "sha256:bae794c30d07f6d910d32a7048af09b5a39ed740918da923c6b780790ebac612"
],
- "version": "==0.15.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.18.2"
},
"lazy-object-proxy": {
"hashes": [
- "sha256:02b260c8deb80db09325b99edf62ae344ce9bc64d68b7a634410b8e9a568edbf",
- "sha256:18f9c401083a4ba6e162355873f906315332ea7035803d0fd8166051e3d402e3",
- "sha256:1f2c6209a8917c525c1e2b55a716135ca4658a3042b5122d4e3413a4030c26ce",
- "sha256:2f06d97f0ca0f414f6b707c974aaf8829c2292c1c497642f63824119d770226f",
- "sha256:616c94f8176808f4018b39f9638080ed86f96b55370b5a9463b2ee5c926f6c5f",
- "sha256:63b91e30ef47ef68a30f0c3c278fbfe9822319c15f34b7538a829515b84ca2a0",
- "sha256:77b454f03860b844f758c5d5c6e5f18d27de899a3db367f4af06bec2e6013a8e",
- "sha256:83fe27ba321e4cfac466178606147d3c0aa18e8087507caec78ed5a966a64905",
- "sha256:84742532d39f72df959d237912344d8a1764c2d03fe58beba96a87bfa11a76d8",
- "sha256:874ebf3caaf55a020aeb08acead813baf5a305927a71ce88c9377970fe7ad3c2",
- "sha256:9f5caf2c7436d44f3cec97c2fa7791f8a675170badbfa86e1992ca1b84c37009",
- "sha256:a0c8758d01fcdfe7ae8e4b4017b13552efa7f1197dd7358dc9da0576f9d0328a",
- "sha256:a4def978d9d28cda2d960c279318d46b327632686d82b4917516c36d4c274512",
- "sha256:ad4f4be843dace866af5fc142509e9b9817ca0c59342fdb176ab6ad552c927f5",
- "sha256:ae33dd198f772f714420c5ab698ff05ff900150486c648d29951e9c70694338e",
- "sha256:b4a2b782b8a8c5522ad35c93e04d60e2ba7f7dcb9271ec8e8c3e08239be6c7b4",
- "sha256:c462eb33f6abca3b34cdedbe84d761f31a60b814e173b98ede3c81bb48967c4f",
- "sha256:fd135b8d35dfdcdb984828c84d695937e58cc5f49e1c854eb311c4d6aa03f4f1"
- ],
- "version": "==1.4.2"
+ "sha256:0c1c7c0433154bb7c54185714c6929acc0ba04ee1b167314a779b9025517eada",
+ "sha256:14010b49a2f56ec4943b6cf925f597b534ee2fe1f0738c84b3bce0c1a11ff10d",
+ "sha256:4e2d9f764f1befd8bdc97673261b8bb888764dfdbd7a4d8f55e4fbcabb8c3fb7",
+ "sha256:4fd031589121ad46e293629b39604031d354043bb5cdf83da4e93c2d7f3389fe",
+ "sha256:5b51d6f3bfeb289dfd4e95de2ecd464cd51982fe6f00e2be1d0bf94864d58acd",
+ "sha256:6850e4aeca6d0df35bb06e05c8b934ff7c533734eb51d0ceb2d63696f1e6030c",
+ "sha256:6f593f26c470a379cf7f5bc6db6b5f1722353e7bf937b8d0d0b3fba911998858",
+ "sha256:71d9ae8a82203511a6f60ca5a1b9f8ad201cac0fc75038b2dc5fa519589c9288",
+ "sha256:7e1561626c49cb394268edd00501b289053a652ed762c58e1081224c8d881cec",
+ "sha256:8f6ce2118a90efa7f62dd38c7dbfffd42f468b180287b748626293bf12ed468f",
+ "sha256:ae032743794fba4d171b5b67310d69176287b5bf82a21f588282406a79498891",
+ "sha256:afcaa24e48bb23b3be31e329deb3f1858f1f1df86aea3d70cb5c8578bfe5261c",
+ "sha256:b70d6e7a332eb0217e7872a73926ad4fdc14f846e85ad6749ad111084e76df25",
+ "sha256:c219a00245af0f6fa4e95901ed28044544f50152840c5b6a3e7b2568db34d156",
+ "sha256:ce58b2b3734c73e68f0e30e4e725264d4d6be95818ec0a0be4bb6bf9a7e79aa8",
+ "sha256:d176f392dbbdaacccf15919c77f526edf11a34aece58b55ab58539807b85436f",
+ "sha256:e20bfa6db17a39c706d24f82df8352488d2943a3b7ce7d4c22579cb89ca8896e",
+ "sha256:eac3a9a5ef13b332c059772fd40b4b1c3d45a3a2b05e33a361dee48e54a4dad0",
+ "sha256:eb329f8d8145379bf5dbe722182410fe8863d186e51bf034d2075eb8d85ee25b"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==1.8.0"
+ },
+ "matplotlib-inline": {
+ "hashes": [
+ "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311",
+ "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"
+ ],
+ "markers": "python_version >= '3.5'",
+ "version": "==0.1.6"
},
"mccabe": {
"hashes": [
- "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
- "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325",
+ "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==0.7.0"
+ },
+ "mypy": {
+ "hashes": [
+ "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d",
+ "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6",
+ "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf",
+ "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f",
+ "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813",
+ "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33",
+ "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad",
+ "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05",
+ "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297",
+ "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06",
+ "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd",
+ "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243",
+ "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305",
+ "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476",
+ "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711",
+ "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70",
+ "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5",
+ "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461",
+ "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab",
+ "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c",
+ "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d",
+ "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135",
+ "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93",
+ "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648",
+ "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a",
+ "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb",
+ "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3",
+ "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372",
+ "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb",
+ "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"
],
- "version": "==0.6.1"
+ "index": "ia",
+ "version": "==0.991"
},
- "more-itertools": {
+ "mypy-extensions": {
"hashes": [
- "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832",
- "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4"
+ "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
+ "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
],
- "version": "==7.2.0"
+ "version": "==0.4.3"
},
"packaging": {
"hashes": [
- "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47",
- "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108"
+ "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3",
+ "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3"
],
- "version": "==19.2"
+ "markers": "python_version >= '3.7'",
+ "version": "==22.0"
},
"parso": {
"hashes": [
- "sha256:63854233e1fadb5da97f2744b6b24346d2750b85965e7e399bec1620232797dc",
- "sha256:666b0ee4a7a1220f65d367617f2cd3ffddff3e205f3f16a0284df30e774c2a9c"
+ "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0",
+ "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"
],
- "version": "==0.5.1"
+ "markers": "python_version >= '3.6'",
+ "version": "==0.8.3"
},
- "pathlib2": {
+ "pathspec": {
"hashes": [
- "sha256:2156525d6576d21c4dcaddfa427fae887ef89a7a9de5cbfe0728b3aafa78427e",
- "sha256:446014523bb9be5c28128c4d2a10ad6bb60769e78bd85658fe44a450674e0ef8"
+ "sha256:3c95343af8b756205e2aba76e843ba9520a24dd84f68c22b9f93251507509dd6",
+ "sha256:56200de4077d9d0791465aa9095a01d421861e405b5096955051deefd697d6f6"
],
- "markers": "python_version < '3.6'",
- "version": "==2.3.4"
+ "markers": "python_version >= '3.7'",
+ "version": "==0.10.3"
},
"pexpect": {
"hashes": [
- "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
- "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
+ "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937",
+ "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"
],
"markers": "sys_platform != 'win32'",
- "version": "==4.7.0"
+ "version": "==4.8.0"
},
"pickleshare": {
"hashes": [
@@ -879,173 +1745,332 @@
],
"version": "==0.7.5"
},
+ "platformdirs": {
+ "hashes": [
+ "sha256:1a89a12377800c81983db6be069ec068eee989748799b946cce2a6e80dcc54ca",
+ "sha256:b46ffafa316e6b83b47489d240ce17173f123a9b9c83282141c3daf26ad9ac2e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==2.6.0"
+ },
"pluggy": {
"hashes": [
- "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6",
- "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34"
+ "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
+ "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
- "version": "==0.13.0"
+ "markers": "python_version >= '3.6'",
+ "version": "==1.0.0"
},
"prompt-toolkit": {
"hashes": [
- "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
- "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
- "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
+ "sha256:3e163f254bef5a03b146397d7c1963bd3e2812f0964bb9a24e6ec761fd28db63",
+ "sha256:aa64ad242a462c5ff0363a7b9cfe696c20d55d9fc60c11fd8e632d064804d305"
],
- "version": "==2.0.9"
+ "markers": "python_full_version >= '3.6.2'",
+ "version": "==3.0.36"
},
"ptyprocess": {
"hashes": [
- "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
- "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
+ "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35",
+ "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"
],
- "version": "==0.6.0"
+ "version": "==0.7.0"
+ },
+ "pure-eval": {
+ "hashes": [
+ "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350",
+ "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"
+ ],
+ "version": "==0.2.2"
},
"py": {
"hashes": [
- "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
- "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
+ "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
+ "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
- "version": "==1.8.0"
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+ "version": "==1.11.0"
+ },
+ "pycodestyle": {
+ "hashes": [
+ "sha256:347187bdb476329d98f695c213d7295a846d1152ff4fe9bacb8a9590b8ee7053",
+ "sha256:8a4eaf0d0495c7395bdab3589ac2db602797d76207242c17d470186815706610"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2.10.0"
+ },
+ "pyflakes": {
+ "hashes": [
+ "sha256:ec55bf7fe21fff7f1ad2f7da62363d749e2a470500eab1b555334b67aa1ef8cf",
+ "sha256:ec8b276a6b60bd80defed25add7e439881c19e64850afd9b346283d4165fd0fd"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==3.0.1"
},
"pygments": {
"hashes": [
- "sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127",
- "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297"
+ "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
+ "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
],
- "version": "==2.4.2"
+ "markers": "python_version >= '3.6'",
+ "version": "==2.13.0"
},
"pylint": {
"hashes": [
- "sha256:2d64b4b8fa044480b1a49d47535da53557f8f426b8c5bd6a23beb65e905101a1",
- "sha256:6cbd124a1a5ed1fd3f3fed4178a6c2ba166862ea0dac6ab2ff8d9f0998b13e5c"
+ "sha256:18783cca3cfee5b83c6c5d10b3cdb66c6594520ffae61890858fe8d932e1c6b4",
+ "sha256:349c8cd36aede4d50a0754a8c0218b43323d13d5d88f4b2952ddfe3e169681eb"
],
"index": "ia",
- "version": "==2.4.1"
+ "version": "==2.15.9"
},
- "pyparsing": {
+ "pytest": {
"hashes": [
- "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80",
- "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4"
+ "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
+ "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
],
- "version": "==2.4.2"
+ "index": "ia",
+ "version": "==6.2.5"
},
- "pytest": {
+ "pytest-cov": {
"hashes": [
- "sha256:813b99704b22c7d377bbd756ebe56c35252bb710937b46f207100e843440b3c2",
- "sha256:cc6620b96bc667a0c8d4fa592a8c9c94178a1bd6cc799dbb057dfd9286d31a31"
+ "sha256:2feb1b751d66a8bd934e5edfa2e961d11309dc37b73b0eabe73b5945fee20f6b",
+ "sha256:996b79efde6433cdbd0088872dbc5fb3ed7fe1578b68cdbba634f14bb8dd0470"
],
"index": "ia",
- "version": "==5.1.3"
+ "version": "==4.0.0"
},
- "pytest-cov": {
+ "pytest-mock": {
"hashes": [
- "sha256:2b097cde81a302e1047331b48cadacf23577e431b61e9c6f49a1170bbe3d3da6",
- "sha256:e00ea4fdde970725482f1f35630d12f074e121a23801aabf2ae154ec6bdd343a"
+ "sha256:f4c973eeae0282963eb293eb173ce91b091a79c1334455acfac9ddee8a1c784b",
+ "sha256:fbbdb085ef7c252a326fd8cdcac0aa3b1333d8811f131bdcc701002e1be7ed4f"
],
"index": "ia",
- "version": "==2.7.1"
+ "version": "==3.10.0"
},
"pytest-pylint": {
"hashes": [
- "sha256:8c38ea779e540e27ec4378b0820d906006e09f4ac834defbd886abbf57c7d2ec",
- "sha256:a4f5e5007f88c2095dcac799e9f7eed3d7e7a2e657596e26093814980ff5fa20",
- "sha256:a574c246535308f8f6ceac10fa82f8fffffa837071f7985b22515895185700c1"
+ "sha256:b51d3f93bed9c192e2b046f16520981bee5abe7bd61b070306e7ee685219fdd3",
+ "sha256:d88e83c1023c641548a9ec3567707ceee7616632a986af133426d4a74d066932"
],
"index": "ia",
- "version": "==0.14.1"
+ "version": "==0.19.0"
},
"pytest-pythonpath": {
"hashes": [
- "sha256:63fc546ace7d2c845c1ee289e8f7a6362c2b6bae497d10c716e58e253e801d62"
+ "sha256:64e195b23a8f8c0c631fb16882d9ad6fa4137ed1f2961ddd15d52065cd435db6",
+ "sha256:e73e11dab2f0b83e73229e261242b251f0a369d7f527dbfec068822fd26a6ce5"
],
"index": "ia",
- "version": "==0.7.3"
+ "version": "==0.7.4"
},
"requests": {
"hashes": [
- "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
- "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
+ "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
+ "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
],
"index": "ia",
- "version": "==2.22.0"
+ "version": "==2.28.1"
},
"responses": {
"hashes": [
- "sha256:502d9c0c8008439cfcdef7e251f507fcfdd503b56e8c0c87c3c3e3393953f790",
- "sha256:97193c0183d63fba8cd3a041c75464e4b09ea0aff6328800d1546598567dde0b"
+ "sha256:396acb2a13d25297789a5866b4881cf4e46ffd49cc26c43ab1117f40b973102e",
+ "sha256:dcf294d204d14c436fddcc74caefdbc5764795a40ff4e6a7740ed8ddbf3294be"
],
"index": "ia",
- "version": "==0.10.6"
+ "version": "==0.22.0"
},
"six": {
"hashes": [
- "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
- "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+ "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==1.16.0"
+ },
+ "stack-data": {
+ "hashes": [
+ "sha256:32d2dd0376772d01b6cb9fc996f3c8b57a357089dec328ed4b6553d037eaf815",
+ "sha256:cbb2a53eb64e5785878201a97ed7c7b94883f48b87bfb0bbe8b623c74679e4a8"
],
- "version": "==1.12.0"
+ "version": "==0.6.2"
+ },
+ "toml": {
+ "hashes": [
+ "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+ "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+ ],
+ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
+ "version": "==0.10.2"
+ },
+ "tomli": {
+ "hashes": [
+ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
+ "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
+ ],
+ "version": "==2.0.1"
+ },
+ "tomlkit": {
+ "hashes": [
+ "sha256:07de26b0d8cfc18f871aec595fda24d95b08fef89d147caa861939f37230bf4b",
+ "sha256:71b952e5721688937fb02cf9d354dbcf0785066149d2855e44531ebdd2b65d73"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==0.11.6"
},
"traitlets": {
"hashes": [
- "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
- "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
- ],
- "version": "==4.3.2"
- },
- "typed-ast": {
- "hashes": [
- "sha256:023625bfa9359e29bd6e24cac2a4503495b49761d48a5f1e38333fc4ac4d93fe",
- "sha256:07591f7a5fdff50e2e566c4c1e9df545c75d21e27d98d18cb405727ed0ef329c",
- "sha256:153e526b0f4ffbfada72d0bb5ffe8574ba02803d2f3a9c605c8cf99dfedd72a2",
- "sha256:3ad2bdcd46a4a1518d7376e9f5016d17718a9ed3c6a3f09203d832f6c165de4a",
- "sha256:3ea98c84df53ada97ee1c5159bb3bc784bd734231235a1ede14c8ae0775049f7",
- "sha256:51a7141ccd076fa561af107cfb7a8b6d06a008d92451a1ac7e73149d18e9a827",
- "sha256:52c93cd10e6c24e7ac97e8615da9f224fd75c61770515cb323316c30830ddb33",
- "sha256:6344c84baeda3d7b33e157f0b292e4dd53d05ddb57a63f738178c01cac4635c9",
- "sha256:64699ca1b3bd5070bdeb043e6d43bc1d0cebe08008548f4a6bee782b0ecce032",
- "sha256:74903f2e56bbffe29282ef8a5487d207d10be0f8513b41aff787d954a4cf91c9",
- "sha256:7891710dba83c29ee2bd51ecaa82f60f6bede40271af781110c08be134207bf2",
- "sha256:91976c56224e26c256a0de0f76d2004ab885a29423737684b4f7ebdd2f46dde2",
- "sha256:9bad678a576ecc71f25eba9f1e3fd8d01c28c12a2834850b458428b3e855f062",
- "sha256:b4726339a4c180a8b6ad9d8b50d2b6dc247e1b79b38fe2290549c98e82e4fd15",
- "sha256:ba36f6aa3f8933edf94ea35826daf92cbb3ec248b89eccdc053d4a815d285357",
- "sha256:bbc96bde544fd19e9ef168e4dfa5c3dfe704bfa78128fa76f361d64d6b0f731a",
- "sha256:c0c927f1e44469056f7f2dada266c79b577da378bbde3f6d2ada726d131e4824",
- "sha256:c0f9a3708008aa59f560fa1bd22385e05b79b8e38e0721a15a8402b089243442",
- "sha256:f0bf6f36ff9c5643004171f11d2fdc745aa3953c5aacf2536a0685db9ceb3fb1",
- "sha256:f5be39a0146be663cbf210a4d95c3c58b2d7df7b043c9047c5448e358f0550a2",
- "sha256:fcd198bf19d9213e5cbf2cde2b9ef20a9856e716f76f9476157f90ae6de06cc6"
- ],
- "markers": "implementation_name == 'cpython' and python_version < '3.7'",
- "version": "==1.2.0"
+ "sha256:6cc57d6dc28c85d5365961726ffd19b538739347749e13ebe34e03323a0e8f84",
+ "sha256:c864831efa0ba6576d09b44884b34e41defc18c0d7e720b4a2d6698c842cab3e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==5.8.0"
},
- "urllib3": {
+ "types-beautifulsoup4": {
"hashes": [
- "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
- "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ "sha256:c1f803367a2b07ad4fdac40ddbea557010dc4ddd1ee92d801f317eb02e2e3c72",
+ "sha256:d46be8f409ddccb6daaa9d118484185e70bcf552085c39c6d05b157cd1462e04"
],
- "markers": "python_version >= '3.4'",
- "version": "==1.22"
+ "index": "ia",
+ "version": "==4.11.6.1"
},
- "wcwidth": {
+ "types-dateparser": {
"hashes": [
- "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
- "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ "sha256:5b0c8845167981f68f090894aa371bddbd0371341b90c3f868ac9524cd0a6b69",
+ "sha256:65232f1b3a952476fb98b31ae0a4019efd32635981040149b97b161d5ce2b4da"
],
- "version": "==0.1.7"
+ "index": "ia",
+ "version": "==1.1.4.4"
},
- "wrapt": {
+ "types-pillow": {
"hashes": [
- "sha256:565a021fd19419476b9362b05eeaa094178de64f8361e44468f9e9d7843901e1"
+ "sha256:98b8484ff343676f6f7051682a6cfd26896e993e86b3ce9badfa0ec8750f5405",
+ "sha256:c18d466dc18550d96b8b4a279ff94f0cbad696825b5ad55466604f1daf5709de"
],
- "version": "==1.11.2"
+ "index": "ia",
+ "version": "==9.3.0.4"
},
- "zipp": {
+ "types-psycopg2": {
"hashes": [
- "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e",
- "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335"
+ "sha256:084558d6bc4b2cfa249b06be0fdd9a14a69d307bae5bb5809a2f14cfbaa7a23f",
+ "sha256:bff045579642ce00b4a3c8f2e401b7f96dfaa34939f10be64b0dd3b53feca57d"
],
- "version": "==0.6.0"
+ "index": "ia",
+ "version": "==2.9.21.2"
+ },
+ "types-requests": {
+ "hashes": [
+ "sha256:0ae38633734990d019b80f5463dfa164ebd3581998ac8435f526da6fe4d598c3",
+ "sha256:b6a2fca8109f4fdba33052f11ed86102bddb2338519e1827387137fefc66a98b"
+ ],
+ "index": "ia",
+ "version": "==2.28.11.7"
+ },
+ "types-toml": {
+ "hashes": [
+ "sha256:171bdb3163d79a520560f24ba916a9fc9bff81659c5448a9fea89240923722be",
+ "sha256:b7b5c4977f96ab7b5ac06d8a6590d17c0bf252a96efc03b109c2711fb3e0eafd"
+ ],
+ "version": "==0.10.8.1"
+ },
+ "types-urllib3": {
+ "hashes": [
+ "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49",
+ "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"
+ ],
+ "version": "==1.26.25.4"
+ },
+ "typing-extensions": {
+ "hashes": [
+ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
+ "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==4.4.0"
+ },
+ "urllib3": {
+ "hashes": [
+ "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
+ "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==1.26.13"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784",
+ "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"
+ ],
+ "version": "==0.2.5"
+ },
+ "wrapt": {
+ "hashes": [
+ "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3",
+ "sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b",
+ "sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4",
+ "sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2",
+ "sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656",
+ "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3",
+ "sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff",
+ "sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310",
+ "sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a",
+ "sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57",
+ "sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069",
+ "sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383",
+ "sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe",
+ "sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87",
+ "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d",
+ "sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b",
+ "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907",
+ "sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f",
+ "sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0",
+ "sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28",
+ "sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1",
+ "sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853",
+ "sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc",
+ "sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3",
+ "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3",
+ "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164",
+ "sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1",
+ "sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c",
+ "sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1",
+ "sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7",
+ "sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1",
+ "sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320",
+ "sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed",
+ "sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1",
+ "sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248",
+ "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c",
+ "sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456",
+ "sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77",
+ "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef",
+ "sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1",
+ "sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7",
+ "sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86",
+ "sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4",
+ "sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d",
+ "sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d",
+ "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8",
+ "sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5",
+ "sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471",
+ "sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00",
+ "sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68",
+ "sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3",
+ "sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d",
+ "sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735",
+ "sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d",
+ "sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569",
+ "sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7",
+ "sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59",
+ "sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5",
+ "sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb",
+ "sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b",
+ "sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f",
+ "sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462",
+ "sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015",
+ "sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af"
+ ],
+ "markers": "python_version < '3.11'",
+ "version": "==1.14.1"
}
}
}
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..4395f19
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,46 @@
+
+This directory contains `sandcrawler` python code for ingest pipelines, batch
+processing, PDF extraction, etc.
+
+
+## Development Quickstart
+
+As of December 2022, working with this code requires:
+
+- Python 3.8 (specifically; this version is pinned in the `Pipfile` used by `pipenv`)
+- `pipenv` for python dependency management
+- generic and python-specific build tools (`pkg-config`, `python-dev`, etc)
+- poppler (PDF processing library)
+- libmagic
+- libsodium
+- access to IA internal packages (`devpi.us.archive.org`), specifically for
+ globalwayback and related packages
+
+In production and CI we use Ubuntu Focal (20.04). The CI script for this
+repository (`../.gitlab-ci.yml`) is the best place to look for a complete list
+of dependencies for both development and deployment. Note that our CI system
+runs from our cluster, which resolves the devpi access issue. For developer
+laptops, you may need `sshuttle` or something similar set up to do initial
+package pulls.
+
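+As a rough sketch, on Ubuntu Focal the system-level dependencies above can be
+installed with something like the following (package names are approximate;
+the authoritative list is in `../.gitlab-ci.yml`):
+
+    sudo apt install pkg-config python3-dev libpoppler-cpp-dev libmagic-dev libsodium-dev
+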
+It is recommended to set the env variable `PIPENV_VENV_IN_PROJECT=true` when
+working with pipenv. You can include this in a `.env` file.
+
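+For example, a minimal `.env` for this purpose might contain just:
+
+    PIPENV_VENV_IN_PROJECT=true
+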
+There is a Makefile which helps with the basics. Eg:
+
+ # install deps using pipenv
+ make deps
+
+ # run python tests
+ make test
+
+ # run code formatting and lint checks
+ make fmt lint
+
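+To run an individual test file rather than the whole suite, you can invoke
+pytest through pipenv directly (the path below is just an example):
+
+    pipenv run pytest tests/test_grobid.py -v
+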
+Sometimes when developing it is helpful to enter a shell with pipenv, eg:
+
+ pipenv shell
+
+Often when developing it is helpful (or necessary) to set environment
+variables. `pipenv shell` will read from `.env`, so you can copy `example.env`
+to `.env` and edit it; those values will then be used in tests, `pipenv shell`,
+etc.
diff --git a/python/TODO b/python/TODO
deleted file mode 100644
index 6b05646..0000000
--- a/python/TODO
+++ /dev/null
@@ -1 +0,0 @@
-- refactor extractor common code into a shared file
diff --git a/python/common.py b/python/common.py
deleted file mode 100644
index e596b35..0000000
--- a/python/common.py
+++ /dev/null
@@ -1,99 +0,0 @@
-
-import json
-from datetime import datetime
-
-NORMAL_MIME = (
- 'application/pdf',
- 'application/postscript',
- 'text/html',
- 'text/xml',
-)
-
-def normalize_mime(raw):
- raw = raw.lower()
- for norm in NORMAL_MIME:
- if raw.startswith(norm):
- return norm
-
- # Special cases
- if raw.startswith('application/xml'):
- return 'text/xml'
- if raw.startswith('application/x-pdf'):
- return 'application/pdf'
- return None
-
-
-def test_normalize_mime():
- assert normalize_mime("asdf") is None
- assert normalize_mime("application/pdf") == "application/pdf"
- assert normalize_mime("application/pdf+journal") == "application/pdf"
- assert normalize_mime("Application/PDF") == "application/pdf"
- assert normalize_mime("application/p") is None
- assert normalize_mime("application/xml+stuff") == "text/xml"
- assert normalize_mime("application/x-pdf") == "application/pdf"
- assert normalize_mime("application/x-html") is None
-
-
-def parse_cdx_line(raw_cdx):
-
- cdx = raw_cdx.split()
- if len(cdx) < 11:
- return None
-
- surt = cdx[0]
- dt = cdx[1]
- url = cdx[2]
- mime = normalize_mime(cdx[3])
- http_status = cdx[4]
- key = cdx[5]
- c_size = cdx[8]
- offset = cdx[9]
- warc = cdx[10]
-
- if not (key.isalnum() and c_size.isdigit() and offset.isdigit()
- and http_status == "200" and len(key) == 32 and dt.isdigit()
- and mime != None):
- return None
-
- if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
- return None
-
- key = "sha1:{}".format(key)
-
- info = dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
- offset=int(offset), warc=warc)
-
- warc_file = warc.split('/')[-1]
- try:
- dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
- except Exception:
- return None
-
- # 'i' intentionally not set
- heritrix = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
- return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}
-
-def parse_ungrobided_line(raw_line):
-
- line = raw_line.strip().split("\t")
- if len(line) != 4:
- return None
-
- key = line[0]
- mime = normalize_mime(line[2])
- try:
- f_c = json.loads(line[1])
- cdx = json.loads(line[3])
- except json.JSONDecodeError:
- return None
-
- if not (key[5:].isalnum() and len(key) == 37 and mime != None):
- print(mime)
- print(key)
- print("FAIL")
- return None
-
- if '-' in (key, mime, f_c, cdx):
- return None
-
- return {'key': key, 'file:mime': mime, 'file:cdx': cdx, 'f:c': f_c}
diff --git a/python/example.env b/python/example.env
new file mode 100644
index 0000000..85af66c
--- /dev/null
+++ b/python/example.env
@@ -0,0 +1,8 @@
+SANDCRAWLER_BLOB_ACCESS_KEY="minioadmin"
+SANDCRAWLER_BLOB_SECRET_KEY="minioadmin"
+IA_ACCESS_KEY="dummy"
+IA_SECRET_KEY="dummy"
+CDX_AUTH_TOKEN="dummy"
+PETABOX_WEBDATA_SECRET="dummy"
+SENTRY_DSN=""
+SANDCRAWLER_WORKING_DIR="/tmp/sandcrawler/"
diff --git a/python/grobid2json.py b/python/grobid2json.py
deleted file mode 100755
index df21883..0000000
--- a/python/grobid2json.py
+++ /dev/null
@@ -1,181 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-NB: adapted to work as a library for PDF extraction. Will probably be
-re-written eventually to be correct, complete, and robust; this is just a
-first iteration.
-
-This script tries to extract everything from a GROBID TEI XML fulltext dump:
-
-- header metadata
-- affiliations
-- references (with context)
-- abstract
-- fulltext
-- tables, figures, equations
-
-A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
-
-- abstract
-- fulltext
-- tables, figures, equations
-
-Prints JSON to stdout, errors to stderr
-"""
-
-import io
-import json
-import argparse
-import xml.etree.ElementTree as ET
-
-xml_ns = "http://www.w3.org/XML/1998/namespace"
-ns = "http://www.tei-c.org/ns/1.0"
-
-def all_authors(elem):
- names = []
- for author in elem.findall('.//{%s}author' % ns):
- pn = author.find('./{%s}persName' % ns)
- if not pn:
- continue
- given_name = pn.findtext('./{%s}forename' % ns) or None
- surname = pn.findtext('./{%s}surname' % ns) or None
- full_name = ' '.join(pn.itertext())
- obj = dict(name=full_name, given_name=given_name, surname=surname)
- ae = author.find('./{%s}affiliation' % ns)
- if ae:
- affiliation = dict()
- for on in ae.findall('./{%s}orgName' % ns):
- affiliation[on.get('type')] = on.text
- addr_e = ae.find('./{%s}address' % ns)
- if addr_e:
- address = dict()
- for t in addr_e.getchildren():
- address[t.tag.split('}')[-1]] = t.text
- if address:
- affiliation['address'] = address
- #affiliation['address'] = {
- # 'post_code': addr.findtext('./{%s}postCode' % ns) or None,
- # 'settlement': addr.findtext('./{%s}settlement' % ns) or None,
- # 'country': addr.findtext('./{%s}country' % ns) or None,
- #}
- obj['affiliation'] = affiliation
- names.append(obj)
- return names
-
-
-def journal_info(elem):
- journal = dict()
- journal['name'] = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
- journal['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
- if journal['publisher'] == '':
- journal['publisher'] = None
- journal['issn'] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
- journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
- journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- journal['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- return journal
-
-
-def biblio_info(elem):
- ref = dict()
- ref['id'] = elem.attrib.get('{http://www.w3.org/XML/1998/namespace}id')
- # Title stuff is messy in references...
- ref['title'] = elem.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
- other_title = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
- if other_title:
- if ref['title']:
- ref['journal'] = other_title
- else:
- ref['journal'] = None
- ref['title'] = other_title
- ref['authors'] = all_authors(elem)
- ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
- if ref['publisher'] == '':
- ref['publisher'] = None
- date = elem.find('.//{%s}date[@type="published"]' % ns)
- ref['date'] = (date != None) and date.attrib.get('when')
- ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- ref['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- el = elem.find('.//{%s}ptr[@target]' % ns)
- if el is not None:
- ref['url'] = el.attrib['target']
- # Hand correction
- if ref['url'].endswith(".Lastaccessed"):
- ref['url'] = ref['url'].replace(".Lastaccessed", "")
- else:
- ref['url'] = None
- return ref
-
-
-def teixml2json(content, encumbered=True):
-
- if type(content) == str:
- content = io.StringIO(content)
- elif type(content) == bytes:
- content = io.BytesIO(content)
-
- info = dict()
-
- #print(content)
- #print(content.getvalue())
- tree = ET.parse(content)
- tei = tree.getroot()
-
- header = tei.find('.//{%s}teiHeader' % ns)
- if header is None:
- raise ValueError("XML does not look like TEI format")
- application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0]
- info['grobid_version'] = application_tag.attrib['version'].strip()
- info['grobid_timestamp'] = application_tag.attrib['when'].strip()
- info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
- info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
- info['journal'] = journal_info(header)
- date = header.find('.//{%s}date[@type="published"]' % ns)
- info['date'] = (date != None) and date.attrib.get('when')
- info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
- info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
- if info['doi']:
- info['doi'] = info['doi'].lower()
-
- refs = []
- for (i, bs) in enumerate(tei.findall('.//{%s}listBibl/{%s}biblStruct' % (ns, ns))):
- ref = biblio_info(bs)
- ref['index'] = i
- refs.append(ref)
- info['citations'] = refs
-
- text = tei.find('.//{%s}text' % (ns))
- print(text.attrib)
- info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
-
- if encumbered:
- el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
- info['abstract'] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}text/{%s}body' % (ns, ns))
- info['body'] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
- info['acknowledgement'] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
- info['annex'] = (el or None) and " ".join(el.itertext()).strip()
-
- return info
-
-def main(): # pragma no cover
- parser = argparse.ArgumentParser(
- description="GROBID TEI XML to JSON",
- usage="%(prog)s [options] <teifile>...")
- parser.add_argument("--no-encumbered",
- action="store_true",
- help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
- parser.add_argument("teifiles", nargs='+')
-
- args = parser.parse_args()
-
- for filename in args.teifiles:
- content = open(filename, 'r')
- print(json.dumps(
- teixml2json(content,
- encumbered=(not args.no_encumbered))))
-
-if __name__=='__main__': # pragma no cover
- main()
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index f21d088..3ffac98 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -1,19 +1,28 @@
#!/usr/bin/env python3
-
"""
These are generally for running one-off tasks from the command line. Output
might go to stdout, or might go to a Kafka topic.
Example of large parallel run, locally:
- cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+ cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
"""
-import sys
import argparse
-import datetime
+import json
+import sys
+
+from grobid_tei_xml import parse_document_xml
from sandcrawler import *
+from sandcrawler.grobid import CrossrefRefsWorker
+
+
+def run_single(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ resp = grobid_client.process_fulltext(blob=args.pdf_file.read())
+ resp["_metadata"] = grobid_client.metadata(resp)
+ print(json.dumps(resp, sort_keys=True))
def run_extract_json(args):
@@ -28,77 +37,162 @@ def run_extract_json(args):
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def run_extract_cdx(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
wayback_client = WaybackClient()
if args.jobs > 1:
worker = GrobidWorker(grobid_client, wayback_client, sink=None)
multi_worker = MultiprocessWrapper(worker, args.sink)
- pusher = CdxLinePusher(multi_worker, args.cdx_file,
- filter_http_statuses=[200], filter_mimetypes=['application/pdf'],
- batch_size=args.jobs)
+ pusher = CdxLinePusher(
+ multi_worker,
+ args.cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=["application/pdf"],
+ batch_size=args.jobs,
+ )
else:
worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink)
- pusher = CdxLinePusher(worker, args.cdx_file,
- filter_http_statuses=[200], filter_mimetypes=['application/pdf'])
+ pusher = CdxLinePusher(
+ worker,
+ args.cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=["application/pdf"],
+ )
pusher.run()
+
def run_extract_zipfile(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
- worker = GrobidBlobWorker(grobid_client, sink=args.sink)
- pusher = ZipfilePusher(worker, args.zip_file)
+ if args.jobs > 1:
+ print("multi-processing: {}".format(args.jobs), file=sys.stderr)
+ worker = GrobidBlobWorker(grobid_client, sink=None)
+ multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+ pusher = ZipfilePusher(multi_worker, args.zip_file, batch_size=args.jobs)
+ else:
+ worker = GrobidBlobWorker(grobid_client, sink=args.sink)
+ pusher = ZipfilePusher(worker, args.zip_file)
+ pusher.run()
+
+
+def run_transform(args):
+ grobid_client = GrobidClient()
+ for line in args.json_file:
+ if not line.strip():
+ continue
+ line = json.loads(line)
+ if args.metadata_only:
+ out = grobid_client.metadata(line)
+ else:
+ tei_doc = parse_document_xml(line["tei_xml"])
+ out = tei_doc.to_legacy_dict()
+ if out:
+ if "source" in line:
+ out["source"] = line["source"]
+ print(json.dumps(out))
+
+
+def run_parse_crossref_refs(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ worker = CrossrefRefsWorker(grobid_client, sink=args.sink)
+ pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
+
def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--kafka-mode',
- action='store_true',
- help="send output to Kafka (not stdout)")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('-j', '--jobs',
- default=8, type=int,
- help="parallelism for batch CPU jobs")
- parser.add_argument('--grobid-host',
- default="http://grobid.qa.fatcat.wiki",
- help="GROBID API host/port")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
+ parser.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
subparsers = parser.add_subparsers()
- sub_extract_json = subparsers.add_parser('extract-json')
+ sub_single = subparsers.add_parser("single")
+ sub_single.set_defaults(func=run_single)
+ sub_single.add_argument(
+ "pdf_file",
+ help="path to PDF file to process",
+ type=argparse.FileType("rb"),
+ )
+
+ sub_extract_json = subparsers.add_parser(
+ "extract-json",
+ help="for each JSON line with CDX info, fetches PDF and does GROBID extraction",
+ )
sub_extract_json.set_defaults(func=run_extract_json)
- sub_extract_json.add_argument('json_file',
+ sub_extract_json.add_argument(
+ "json_file",
help="JSON file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_cdx = subparsers.add_parser('extract-cdx')
+ sub_extract_cdx = subparsers.add_parser(
+ "extract-cdx", help="for each CDX line, fetches PDF and does GROBID extraction"
+ )
sub_extract_cdx.set_defaults(func=run_extract_cdx)
- sub_extract_cdx.add_argument('cdx_file',
+ sub_extract_cdx.add_argument(
+ "cdx_file",
help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
- sub_extract_zipfile = subparsers.add_parser('extract-zipfile')
+ sub_extract_zipfile = subparsers.add_parser(
+ "extract-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does GROBID extract for each",
+ )
sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
- sub_extract_zipfile.add_argument('zip_file',
- help="zipfile with PDFs to extract",
- type=str)
+ sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
+
+ sub_parse_crossref_refs = subparsers.add_parser(
+ "parse-crossref-refs",
+ help="reads Crossref metadata records, parses any unstructured refs with GROBID",
+ )
+ sub_parse_crossref_refs.set_defaults(func=run_parse_crossref_refs)
+ sub_parse_crossref_refs.add_argument(
+ "json_file",
+ help="JSON-L file to process (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_transform = subparsers.add_parser("transform")
+ sub_transform.set_defaults(func=run_transform)
+ sub_transform.add_argument(
+ "--metadata-only",
+ action="store_true",
+ help="Only pass through bibliographic metadata, not fulltext",
+ )
+ sub_transform.add_argument(
+ "json_file",
+ help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field",
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
- print("tell me what to do!")
+ parser.print_help(file=sys.stderr)
sys.exit(-1)
args.sink = None
if args.kafka_mode:
produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
- args.sink = KafkaGrobidSink(kafka_hosts=args.kafka_hosts,
- produce_topic=produce_topic)
+ args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index bc814de..493c9e7 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -1,8 +1,7 @@
#!/usr/bin/env python3
-
"""
Input is IA item metadata JSON.
-Ouput is insertable fatcat "match" JSON
+Output is insertable fatcat "match" JSON
- md5
- sha1
@@ -22,87 +21,93 @@ When invoking import matched, be sure to:
--default-mimetype application/pdf
"""
-import sys
import json
+import sys
+from typing import Any, Dict, Optional
-def parse(obj):
- if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'):
- sys.stderr.write('skip: test item\n')
+
+def parse(obj: dict) -> Optional[Dict[str, Any]]:
+ if obj["metadata"]["identifier"].endswith("-test") or obj["metadata"].get("test"):
+ print("skip: test item", file=sys.stderr)
return None
extid_type = None
extid = None
- if obj['metadata']['identifier'].startswith('arxiv-'):
- extid_type = 'arxiv'
- extid = obj['metadata'].get('source')
+ if obj["metadata"]["identifier"].startswith("arxiv-"):
+ extid_type = "arxiv"
+ extid = obj["metadata"].get("source")
if not extid:
- sys.stderr.write('skip: no source\n')
+ print("skip: no source", file=sys.stderr)
return None
- assert extid.startswith('http://arxiv.org/abs/')
- extid = extid.replace('http://arxiv.org/abs/', '')
- #print(extid)
- assert '/' in extid or '.' in extid
- if not 'v' in extid or not extid[-1].isdigit():
- sys.stderr.write('skip: non-versioned arxiv_id\n')
+ assert extid.startswith("http://arxiv.org/abs/")
+ extid = extid.replace("http://arxiv.org/abs/", "")
+ # print(extid)
+ assert "/" in extid or "." in extid
+ if "v" not in extid or not extid[-1].isdigit():
+ print("skip: non-versioned arxiv_id", file=sys.stderr)
return None
- elif obj['metadata']['identifier'].startswith('paper-doi-10_'):
- extid_type = 'doi'
- extid = obj['metadata']['identifier-doi']
+ elif obj["metadata"]["identifier"].startswith("paper-doi-10_"):
+ extid_type = "doi"
+ extid = obj["metadata"]["identifier-doi"]
assert extid.startswith("10.")
- elif obj['metadata']['identifier'].startswith('pubmed-PMC'):
- extid_type = 'pmcid'
- extid = obj['metadata']['identifier'].replace('pubmed-', '')
+ elif obj["metadata"]["identifier"].startswith("pubmed-PMC"):
+ extid_type = "pmcid"
+ extid = obj["metadata"]["identifier"].replace("pubmed-", "")
assert extid.startswith("PMC")
int(extid[3:])
- elif obj['metadata']['identifier'].startswith('jstor-'):
- extid_type = 'jstor'
- extid = obj['metadata']['identifier'].replace('jstor-', '')
+ elif obj["metadata"]["identifier"].startswith("jstor-"):
+ extid_type = "jstor"
+ extid = obj["metadata"]["identifier"].replace("jstor-", "")
int(extid)
else:
raise NotImplementedError()
pdf_file = None
- for f in obj['files']:
- if f['source'] == "original" and "PDF" in f['format']:
+ for f in obj["files"]:
+ if f["source"] == "original" and "PDF" in f["format"]:
pdf_file = f
break
if not pdf_file:
- sys.stderr.write('skip: no PDF found: {}\n'.format(obj['metadata']['identifier']))
- #for f in obj['files']:
- # sys.stderr.write(f['format'] + "\n")
+ print("skip: no PDF found: {}".format(obj["metadata"]["identifier"]), file=sys.stderr)
+ # for f in obj['files']:
+ # print(f['format'], file=sys.stderr)
return None
- assert pdf_file['name'].endswith('.pdf')
+ assert pdf_file["name"].endswith(".pdf")
match = {
- 'md5': pdf_file['md5'],
- 'sha1': pdf_file['sha1'],
- 'size': int(pdf_file['size']),
- 'mimetype': 'application/pdf',
- 'urls': [
+ "md5": pdf_file["md5"],
+ "sha1": pdf_file["sha1"],
+ "size": int(pdf_file["size"]),
+ "mimetype": "application/pdf",
+ "urls": [
"https://archive.org/download/{}/{}".format(
- obj['metadata']['identifier'],
- pdf_file['name']),
+ obj["metadata"]["identifier"], pdf_file["name"]
+ ),
],
- 'cdx': [],
- 'dois': [],
+ "cdx": [],
+ "dois": [],
}
- if extid_type == 'doi':
- match['dois'] = [extid,]
+ if extid_type == "doi":
+ match["dois"] = [
+ extid,
+ ]
else:
match[extid_type] = extid
return match
-def run():
+
+def run() -> None:
for line in sys.stdin:
if not line:
continue
obj = json.loads(line)
match = parse(obj)
- if match:
- print(json.dumps(match))
+ if match is not None:
+ print(json.dumps(match, sort_keys=True))
+
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/ingest_file.py b/python/ingest_file.py
deleted file mode 100755
index 4daa472..0000000
--- a/python/ingest_file.py
+++ /dev/null
@@ -1,386 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-IngestRequest
- - ingest_type
- - base_url
- - release_stage
- - release_id
- - ext_ids
- - doi
- - pmcid
- - ...
- - expect_mimetypes
- - project/source (?)
- - expect_sha1
-
-FileIngestResult
- - request (object)
- - terminal
- - url
- - status_code
- - wayback
- - datetime
- - archive_url
- - file_meta
- - size_bytes
- - md5
- - sha1
- - sha256
- - mimetype
- - grobid
- - version
- - status_code
- - xml_url
- - release_id
- - status (slug)
- - hit (boolean)
-
-Simplified process, assuming totally new URL and PDF file:
-
-- crawl via SPN (including redirects, extraction)
- => terminal
- => wayback
-- calculate file metadata
- => file_meta
-- run GROBID
- => grobid
-
-Optimizations:
-
-- sandcrawler-db lookup of base_url: terminal+wayback
-- GWB CDX lookup of base_url: terminal+wayback
-- sandcrawler-db lookup of GROBID: grobid
-
-New "ingest" table?
-- base_url (indexed)
-- datetime
-- terminal_status
-- terminal_url
-- terminal_sha1
-- hit
-
-"""
-
-import sys
-import json
-import base64
-import hashlib
-import argparse
-import datetime
-import requests
-from http.server import BaseHTTPRequestHandler, HTTPServer
-
-from grobid2json import teixml2json
-
-
-GROBID_ENDPOINT = "http://grobid.qa.fatcat.wiki"
-
-class CDXApiError(Exception):
- pass
-
-class WaybackError(Exception):
- pass
-
-class SavePageNowError(Exception):
- pass
-
-class SandcrawlerDB:
-
- def __init__(self, **kwargs):
- self.api_uri = kwargs.get('api_url',
- "http://aitio.us.archive.org:3030")
-
- def get_cdx(self, url):
- resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.'+url))
- resp.raise_for_status()
- return resp.json() or None
-
- def get_grobid(self, sha1):
- resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex='eq.'+sha1))
- resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
- else:
- return None
-
- def get_file_meta(self, sha1):
- resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.'+sha1))
- resp.raise_for_status()
- resp = resp.json()
- if resp:
- return resp[0]
- else:
- return None
-
-def b32_hex(s):
- s = s.strip().split()[0].lower()
- if s.startswith("sha1:"):
- s = s[5:]
- if len(s) != 32:
- return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
-
-def cdx_api_lookup(url):
- """
- Returns a CDX dict, or None if not found.
- """
- CDX_API_ENDPOINT = "https://web.archive.org/cdx/search/cdx"
-
- resp = requests.get(CDX_API_ENDPOINT, params={
- 'url': url,
- 'matchType': 'exact',
- 'limit': -1,
- 'filter': 'statuscode:200',
- 'output': 'json',
- })
- if resp.status_code != 200:
- raise CDXApiError(resp.text)
- rj = resp.json()
- if len(rj) <= 1:
- return None
- cdx = rj[1]
- assert len(cdx) == 7 # JSON is short
- cdx = dict(
- surt=cdx[0],
- datetime=cdx[1],
- url=cdx[2],
- mimetype=cdx[3],
- status_code=int(cdx[4]),
- sha1b32=cdx[5],
- sha1hex=b32_hex(cdx[5]),
- )
- return cdx
-
-def parse_html(body):
- raise NotImplementedError()
-
-def save_url_now(url):
- """
- Tries to "save page now"
- """
-
- SPN_ENDPOINT = "https://web.archive.org/save/"
- resp = requests.get(SPN_ENDPOINT + url)
- if resp.status_code != 200:
- raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
- cdx = cdx_api_lookup(url)
- body = resp.content
- return (cdx, body)
-
-def get_cdx_and_body(url):
- """
- Returns a CDX dict and body as a tuple.
-
- If there isn't an existing wayback capture, take one now. Raises an
- exception if can't capture, or if CDX API not available.
-
- Raises an exception if can't find/fetch.
-
- TODO:
- - doesn't handle redirects (at CDX layer). could allow 3xx status codes and follow recursively
- """
-
- WAYBACK_ENDPOINT = "https://web.archive.org/web/"
-
- cdx = cdx_api_lookup(url)
- if not cdx:
- return save_url_now(url)
-
- resp = requests.get(WAYBACK_ENDPOINT + cdx['datetime'] + "id_/" + cdx['url'])
- if resp.status_code != 200:
- raise WaybackError(resp.text)
- body = resp.content
- return (cdx, body)
-
-def file_metadata(blob):
- """
- Returns a dict: size_bytes, md5, sha1, sha256
- """
- hashes = [
- hashlib.sha1(),
- hashlib.sha256(),
- hashlib.md5(),
- ]
- for h in hashes:
- h.update(blob)
- return dict(
- size_bytes=len(blob),
- sha1=hashes[0].hexdigest(),
- sha256=hashes[1].hexdigest(),
- md5=hashes[2].hexdigest(),
- )
-
-
-def do_grobid(sha1hex, blob):
- grobid_response = requests.post(
- GROBID_ENDPOINT + "/api/processFulltextDocument",
- files={'input': blob, 'consolidateHeader': '2'},
- )
-
- info = dict(
- sha1hex=sha1hex,
- status_code=grobid_response.status_code,
- )
- # 4 MByte XML size limit; don't record GROBID status on this path
- if len(grobid_response.content) > 4000000:
- info['status'] = 'oversize'
- return info
- if grobid_response.status_code != 200:
- # response.text is .content decoded as utf-8
- info['status'] = 'error'
- info['error_msg'] = grobid_response.text[:10000]
- dict(status='error', description=grobid_response.text)
- return info, dict(status="error", reason="non-200 GROBID HTTP status",
- extra=grobid_response.text)
- else:
- info['status'] = 'success'
-
- metadata = teixml2json(grobid_response.text, encumbered=False)
- year = None
- mdate = metadata.get('date')
- if mdate and len(mdate) >= 4:
- year = int(mdate[0:4])
- info['metadata'] = dict(
- title=metadata.get('title'),
- authors=metadata.get('authors'),
- journal=metadata.get('journal'),
- year=metadata.get('year'),
- # TODO: any other biblio-glutton fields? first-page, volume
- )
- info['version'] = metadata.get('grobid_version')
- info['timestamp'] = metadata.get('grobid_timestamp')
- info['glutton_fatcat'] = metadata.get('fatcat_release')
- # TODO: push to kafka
- return info
-
-def ingest_file(request):
- """
- 1. check sandcrawler-db for base_url
- -> if found, populate terminal+wayback fields
- 2. check CDX for base_url (only 200, past year)
- -> if found, populate terminal+wayback fields
- 3. if we have wayback, fetch that. otherwise do recursive SPN crawl
- -> populate terminal+wayback
- 4. calculate file_meta
- -> populate file_meta
- 5. check sandcrawler-db for GROBID XML
- 6. run GROBID if we didn't already
- -> push results to minio+sandcrawler-db
- 7. decide if this was a hit
-
- In all cases, print JSON status, and maybe push to sandcrawler-db
- """
-
- response = dict(request=request)
- url = request['base_url']
- while url:
- (cdx_dict, body) = get_cdx_and_body(url)
- sys.stderr.write("CDX hit: {}\n".format(cdx_dict))
-
- response['cdx'] = cdx_dict
- response['terminal'] = dict()
- if 'html' in cdx_dict['mimetype']:
- page_metadata = parse_html(body)
- if page_metadata.get('pdf_url'):
- url = page_metadata.get('pdf_url')
- continue
- response['terminal']['html'] = page_metadata
- response['status'] = 'no-pdf-link'
- return response
- elif 'pdf' in cdx_dict['mimetype']:
- break
- else:
- response['status'] = 'other-mimetype'
- return response
-
- # if we got here, we have a PDF
- response['file_meta'] = file_metadata(body)
- sha1hex = response['file_meta']['sha1']
-
- # do GROBID
- response['grobid'] = do_grobid(sha1hex, body)
- sys.stderr.write("GROBID status: {}\n".format(response['grobid']['status']))
-
- # Ok, now what?
- sys.stderr.write("GOT TO END\n")
- response['status'] = "success"
- response['hit'] = True
- return response
-
-def run_single_ingest(args):
- request = dict(
- base_url=args.url,
- ext_ids=dict(doi=args.doi),
- release_id=args.release_id,
- )
- result = ingest_file(request)
- print(json.dumps(result))
- return result
-
-def run_requests(args):
- for l in args.json_file:
- request = json.loads(l.strip())
- result = ingest_file(request)
- print(json.dumps(result))
-
-class IngestFileRequestHandler(BaseHTTPRequestHandler):
- def do_POST(self):
- if self.path != "/ingest":
- self.send_response(404)
- self.end_headers()
- self.wfile.write("404: Not Found")
- return
- length = int(self.headers.get('content-length'))
- request = json.loads(self.rfile.read(length).decode('utf-8'))
- print("Got request: {}".format(request))
- result = ingest_file(request)
- self.send_response(200)
- self.end_headers()
- self.wfile.write(json.dumps(result).encode('utf-8'))
-
-def run_api(args):
- port = 8083
- print("Listening on localhost:{}".format(port))
- server = HTTPServer(('', port), IngestFileRequestHandler)
- server.serve_forever()
-
-def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--api-host-url',
- default="http://localhost:9411/v0",
- help="fatcat API host/port to use")
- subparsers = parser.add_subparsers()
-
- sub_single= subparsers.add_parser('single')
- sub_single.set_defaults(func=run_single_ingest)
- sub_single.add_argument('--release-id',
- help="(optional) existing release ident to match to")
- sub_single.add_argument('--doi',
- help="(optional) existing release DOI to match to")
- sub_single.add_argument('url',
- help="URL of paper to fetch")
-
- sub_requests = subparsers.add_parser('requests')
- sub_requests.set_defaults(func=run_requests)
- sub_requests.add_argument('json_file',
- help="JSON file (request per line) to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
-
- sub_api = subparsers.add_parser('api')
- sub_api.set_defaults(func=run_api)
- sub_api.add_argument('--port',
- help="HTTP port to listen on",
- default=8033, type=int)
-
- args = parser.parse_args()
- if not args.__dict__.get("func"):
- sys.stderr.write("tell me what to do!\n")
- sys.exit(-1)
-
- args.func(args)
-
-if __name__ == '__main__':
- main()
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
new file mode 100755
index 0000000..0b74f9f
--- /dev/null
+++ b/python/ingest_tool.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import subprocess
+import sys
+from http.server import HTTPServer
+
+import sentry_sdk
+
+from sandcrawler import GrobidClient, JsonLinePusher, KafkaCompressSink, KafkaSink
+from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
+from sandcrawler.ingest_fileset import IngestFilesetWorker
+
+
+def run_single_ingest(args):
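+ # Example invocation (a sketch; the URL is an illustrative placeholder):
+ #   ./ingest_tool.py single pdf https://example.com/paper.pdf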
+ request = dict(
+ ingest_type=args.ingest_type,
+ base_url=args.url,
+ ext_ids=dict(doi=args.doi),
+ fatcat=dict(release_ident=args.release_id),
+ )
+ if args.force_recrawl:
+ request["force_recrawl"] = True
+ if request["ingest_type"] in [
+ "dataset",
+ ]:
+ ingester = IngestFilesetWorker(
+ try_spn2=not args.no_spn2,
+ ingest_file_result_stdout=True,
+ )
+ else:
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ ingester = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
+ grobid_client=grobid_client,
+ )
+ result = ingester.process(request)
+ print(json.dumps(result, sort_keys=True))
+ return result
+
+
+def run_requests(args):
+ # TODO: switch to using JsonLinePusher
+ file_worker = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
+ )
+ fileset_worker = IngestFilesetWorker(
+ try_spn2=not args.no_spn2,
+ )
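+ # Each input line is a standalone JSON ingest request; a minimal example
+ # (values are illustrative placeholders):
+ #   {"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf"}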
+ for line in args.json_file:
+ request = json.loads(line.strip())
+ if request["ingest_type"] in [
+ "dataset",
+ ]:
+ result = fileset_worker.process(request)
+ else:
+ result = file_worker.process(request)
+ print(json.dumps(result, sort_keys=True))
+
+
+def run_file_requests_backfill(args):
+ """
+ Special mode for persisting GROBID and pdfextract results to Kafka, but
+ printing ingest result to stdout.
+
+ Can be used to batch re-process known files.
+ """
+ grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
+ grobid_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=grobid_topic,
+ )
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ pdftext_sink = KafkaCompressSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=pdftext_topic,
+ )
+ thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=thumbnail_topic,
+ )
+ xmldoc_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=xmldoc_topic,
+ )
+ htmlteixml_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=htmlteixml_topic,
+ )
+ worker = IngestFileWorker(
+ grobid_client=grobid_client,
+ sink=None,
+ grobid_sink=grobid_sink,
+ thumbnail_sink=thumbnail_sink,
+ pdftext_sink=pdftext_sink,
+ xmldoc_sink=xmldoc_sink,
+ htmlteixml_sink=htmlteixml_sink,
+ try_spn2=False,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ )
+ pusher.run()
+
+
+def run_spn_status(args):
+ worker = IngestFileWorker(
+ sink=None,
+ try_spn2=False,
+ )
+
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/system")
+ resp.raise_for_status()
+ print(f"System status: {json.dumps(resp.json(), sort_keys=True)}")
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/user")
+ resp.raise_for_status()
+ print(f"User status: {json.dumps(resp.json(), sort_keys=True)}")
+
+
+def run_api(args):
+ port = args.port
+ print("Listening on localhost:{}".format(port))
+ server = HTTPServer(("", port), IngestFileRequestHandler)
+ server.serve_forever()
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--enable-sentry",
+ action="store_true",
+ help="report exceptions to Sentry",
+ )
+ parser.add_argument("--env", default="dev", help="environment (eg, prod, qa, dev)")
+ subparsers = parser.add_subparsers()
+
+ sub_single = subparsers.add_parser("single", help="ingests a single base URL")
+ sub_single.set_defaults(func=run_single_ingest)
+ sub_single.add_argument(
+ "ingest_type", default="pdf", help="type of ingest (pdf, html, etc)"
+ )
+ sub_single.add_argument(
+ "--release-id", help="(optional) existing release ident to match to"
+ )
+ sub_single.add_argument("--doi", help="(optional) existing release DOI to match to")
+ sub_single.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="ignore GWB history and use SPNv2 to re-crawl",
+ )
+ sub_single.add_argument("--no-spn2", action="store_true", help="don't use live web (SPNv2)")
+ sub_single.add_argument(
+ "--html-quick-mode",
+ action="store_true",
+ help="don't fetch individual sub-resources, just use CDX",
+ )
+ sub_single.add_argument("url", help="URL of paper to fetch")
+ sub_single.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+
+ sub_requests = subparsers.add_parser(
+ "requests", help="takes a series of ingest requests (JSON, per line) and runs each"
+ )
+ sub_requests.add_argument(
+ "--no-spn2", action="store_true", help="don't use live web (SPNv2)"
+ )
+ sub_requests.add_argument(
+ "--html-quick-mode",
+ action="store_true",
+ help="don't fetch individual sub-resources, just use CDX",
+ )
+ sub_requests.set_defaults(func=run_requests)
+ sub_requests.add_argument(
+ "json_file",
+ help="JSON file (request per line) to import from (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ sub_api = subparsers.add_parser(
+ "api", help="starts a simple HTTP server that processes ingest requests"
+ )
+ sub_api.set_defaults(func=run_api)
+ sub_api.add_argument("--port", help="HTTP port to listen on", default=8033, type=int)
+
+ sub_file_requests_backfill = subparsers.add_parser(
+ "file-requests-backfill",
+ help="starts a simple HTTP server that processes ingest requests",
+ )
+ sub_file_requests_backfill.set_defaults(func=run_file_requests_backfill)
+ sub_file_requests_backfill.add_argument(
+ "json_file",
+ help="JSON file (request per line) to import from (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_file_requests_backfill.add_argument(
+ "--kafka-hosts",
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use",
+ )
+ sub_file_requests_backfill.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+
+ sub_spn_status = subparsers.add_parser(
+ "spn-status", help="checks save-page-now v2 API status for bot user"
+ )
+ sub_spn_status.set_defaults(func=run_spn_status)
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ parser.print_help(file=sys.stderr)
+ sys.exit(-1)
+
+ # configure sentry *after* parsing args
+ if args.enable_sentry:
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/kafka_grobid.py b/python/kafka_grobid.py
deleted file mode 100755
index cde7a2d..0000000
--- a/python/kafka_grobid.py
+++ /dev/null
@@ -1,327 +0,0 @@
-#!/usr/bin/env python3
-"""
-Kafka worker that does GROBID extraction from one queue and into another.
-
-Based on the ungrobided Hadoop job code. Does not talk to HBase at all, just
-petabox and GROBID. Will delegate tasks to random GROBID workers.
-
-Lines (tasks) are enqueued using a trivial kafkacat invocation; output is
-persisted in Kafka (in compressed format), and also drained into HBase by a
-second worker.
-
-Schema of tasks is the 'ungrobided' TSV output. Schema of output is JSON with
-keys:
-
- "key": SHA1 in base32 with prefix, eg, "sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT"
- "grobid0:status_code": HTTP status code (integer)
- "grobid0:status": dict/json
- "grobid0:tei_xml": xml as a single string
- "f:c": dict/json from input
- "file:mime": string from input
- "file:cdx": dict/json from input
- # NOT grobid0:tei_json, grobid0:metadata, or grobid0:quality, which can be
- # re-derived from tei_xml
-
-Requires:
-- requests
-- pykafka
-- wayback/GWB libraries
-"""
-
-# XXX: some broken MRO thing going on in here due to python3 object wrangling
-# in `wayback` library. Means we can't run pylint.
-# pylint: skip-file
-
-import os
-import sys
-import xml
-import json
-import raven
-import struct
-import argparse
-import requests
-import pykafka
-import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
-from gwb.loader import CDXLoaderFactory
-
-from common import parse_ungrobided_line
-from grobid2json import teixml2json
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
-
-# Specific poison-pill rows we should skip
-KEY_DENYLIST = (
- 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format"
-)
-
-class KafkaGrobidWorker:
-
- def __init__(self, kafka_hosts, consume_topic, produce_topic, **kwargs):
- self.consume_topic = consume_topic
- self.produce_topic = produce_topic
- self.consumer_group = kwargs.get('consumer_group', 'grobid-extraction')
- self.kafka_hosts = kafka_hosts or 'localhost:9092'
- self.grobid_uri = kwargs.get('grobid_uri')
- # /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
- # gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
- self.mime_filter = ['application/pdf']
- self.rstore = None
- self.produce_max_request_size = 20000000 # Kafka producer batch size tuning; also limit on size of single extracted document
-
- def grobid_process_fulltext(self, content):
- r = requests.post(self.grobid_uri + "/api/processFulltextDocument",
- files={'input': content})
- return r
-
- def parse_line(self, raw_line):
- """Line should be TSV and have non-null fields:
-
- - key (string) (separated in Kafka case)
- - f:c (string, json)
- - file:mime (string)
- - file:cdx (string, json)
- """
-
- if (raw_line.startswith(' ') or raw_line.startswith('#') or raw_line.startswith('\t')):
- return None, dict(status="invalid", reason="line prefix", input=raw_line)
-
- info = parse_ungrobided_line(raw_line)
- if info is None:
- return None, dict(status="invalid", reason="ungrobided parse")
-
- if info['file:mime'] not in self.mime_filter:
- return None, dict(status="skip", reason="mimetype", mimetype=info['file:mime'])
-
- # If warc is not item/file.(w)arc.gz form, skip it
- if len(info['file:cdx']['warc'].split('/')) != 2:
- return None, dict(status="skip", reason="WARC path not petabox item/file", path=info['file:cdx']['warc'])
-
- return info, None
-
- def fetch_warc_content(self, warc_path, offset, c_size):
- warc_uri = self.warc_uri_prefix + warc_path
- if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
- try:
- gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
- except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
- except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
- except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
- except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
- # Note: could consider a generic "except Exception" here, as we get so
- # many petabox errors. Do want jobs to fail loud and clear when the
- # whole cluster is down though.
-
- if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
-
- try:
- raw_content = gwb_record.open_raw_content().read()
- except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
- return raw_content, None
-
- def extract(self, info):
-
- # Fetch data from WARCs in petabox
- original_content, status = self.fetch_warc_content(
- info['file:cdx']['warc'],
- info['file:cdx']['offset'],
- info['file:cdx']['c_size'])
- if status:
- return None, status
-
- info['file:size'] = len(original_content)
-
- # Submit to GROBID
- try:
- grobid_response = self.grobid_process_fulltext(original_content)
- except requests.exceptions.ConnectionError:
- return None, dict(status="error", reason="connection to GROBID worker")
-
- info['grobid0:status_code'] = grobid_response.status_code
-
- # 4 MByte XML size limit; don't record GROBID status on this path
- if len(grobid_response.content) > 4000000:
- info['grobid0:status'] = {'status': 'oversize'}
- return info, dict(status="oversize", reason="TEI response was too large")
-
- if grobid_response.status_code != 200:
- # response.text is .content decoded as utf-8
- info['grobid0:status'] = dict(status='error', description=grobid_response.text)
- return info, dict(status="error", reason="non-200 GROBID HTTP status",
- extra=grobid_response.text)
-
- info['grobid0:status'] = {'status': 'partial'}
- info['grobid0:tei_xml'] = grobid_response.content
- info['grobid0:status'] = {'status': 'success'}
-
- return info, None
-
- def do_work(self, raw_line):
- """
- 1. parse filtered line
- 2. fetch data from wayback
- 3. submit to GROBID
- 4. convert GROBID response to JSON (and metadata)
- 5. determine "quality"
- 6. produce result to kafka
-
- Returns: (grobid_output, status) (both are None or dict)
- If grobid_output is None, error was recovered, status returned.
- Otherwise, we were successful; grobid_output should be JSON serialized
- and published to kafka.
- """
-
- #self.increment_counter('lines', 'total')
-
- # Parse line and filter down
- info, status = self.parse_line(raw_line)
- if info is None:
- #self.increment_counter('lines', status['status'])
- return None, status
- key = info['key']
- if key in KEY_DENYLIST:
- #self.increment_counter('lines', 'denylist')
- return None, dict(status='denylist', key=key)
-
- # Note: this may not get "cleared" correctly
- sentry_client.extra_context(dict(
- row_key=key,
- cdx_url=info['file:cdx']['url'],
- cdx_dt=info['file:cdx']['dt'],
- cdx_warc=info['file:cdx']['warc'],
- ))
-
- # Do the extraction
- info, status = self.extract(info)
- if info is None:
- #self.increment_counter('lines', status['status'])
- status['key'] = key
- return None, status
- extraction_status = status
-
- # Need to encode 'bytes' as 'str' for JSON serialization
- if info.get('grobid0:tei_xml'):
- info['grobid0:tei_xml'] = info['grobid0:tei_xml'].decode('utf-8')
-
- #self.increment_counter('lines', 'success')
-
- grobid_status_code = info.get('grobid0:status_code', None)
- if extraction_status is not None:
- return info, dict(status="partial", key=key,
- grobid_status_code=grobid_status_code,
- reason=extraction_status['reason'])
- else:
- return info, dict(status="success",
- grobid_status_code=grobid_status_code, key=key,
- extra=extraction_status)
-
- def run(self):
-
- # 1. start consumer (in managed/balanced fashion, with consumer group)
- # 2. for each thingie, do the work; if success publish to kafka; either
- # way... print? log?
- # 3. repeat!
-
- print("Starting Kafka GROBID extraction worker...")
- kafka = pykafka.KafkaClient(hosts=self.kafka_hosts, broker_version="2.0.0")
- produce_topic = kafka.topics[self.produce_topic]
- consume_topic = kafka.topics[self.consume_topic]
-
- sequential_failures = 0
- # Configure producer to basically *immediately* publish messages,
- # one-at-a-time, but asynchronously (don't block). Fetch and GROBID
- # process takes a while, and we want to send as soon as processing is
- # done.
- with produce_topic.get_producer(sync=False,
- compression=pykafka.common.CompressionType.GZIP,
- retry_backoff_ms=250,
- max_queued_messages=50,
- min_queued_messages=10,
- linger_ms=5000,
- max_request_size=self.produce_max_request_size) as producer:
- print("Producing to: {}".format(self.produce_topic))
- consumer = consume_topic.get_balanced_consumer(
- consumer_group=self.consumer_group,
- managed=True,
- auto_commit_enable=True,
- auto_commit_interval_ms=30000, # 30 seconds
- # LATEST because best to miss processing than waste time re-process
- auto_offset_reset=pykafka.common.OffsetType.LATEST,
- queued_max_messages=50,
- compacted_topic=True)
- print("Consuming from: {} as {}".format(self.consume_topic, self.consumer_group))
- sys.stdout.flush()
- for msg in consumer:
- grobid_output, status = self.do_work(msg.value.decode('utf-8'))
- if grobid_output:
- print("extracted {}: {}".format(
- grobid_output.get('key'),
- status))
- sys.stdout.flush()
- producer.produce(json.dumps(grobid_output).encode('utf-8'))
- sequential_failures = 0
- else:
- sys.stderr.write("failed to extract: {}\n".format(status))
- sequential_failures += 1
- if sequential_failures > 20:
- sys.stderr.write("too many failures in a row, bailing out\n")
- sys.exit(-1)
-
-
-@sentry_client.capture_exceptions
-def main():
-
- parser = argparse.ArgumentParser()
- parser.add_argument('--kafka-hosts',
- default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="qa",
- help="eg, 'qa' or 'prod'")
- parser.add_argument('--consume-topic',
- default=None,
- help="Kafka topic to consume from")
- parser.add_argument('--produce-topic',
- default=None,
- help="Kafka topic to produce to")
- parser.add_argument('--grobid-uri',
- type=str,
- default='http://localhost:8070',
- help='URI of GROBID API Server')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- args = parser.parse_args()
-
- if args.consume_topic is None:
- args.consume_topic = "sandcrawler-{}.ungrobided".format(args.kafka_env)
- if args.produce_topic is None:
- args.produce_topic = "sandcrawler-{}.grobid-output".format(args.kafka_env)
-
- worker = KafkaGrobidWorker(**args.__dict__)
- worker.run()
-
-if __name__ == '__main__': # pragma: no cover
- main()
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
new file mode 100755
index 0000000..28d6397
--- /dev/null
+++ b/python/pdfextract_tool.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""
+KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode
+"""
+
+import argparse
+import sys
+
+from sandcrawler import *
+
+
+def run_extract_json(args):
+ wayback_client = WaybackClient()
+ if args.jobs > 1:
+ worker = PdfExtractWorker(wayback_client, sink=None, thumbnail_sink=None)
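+ # thumbnail_sink is dropped in the multi-processing path, which is the
+ # KNOWN ISSUE noted in the module docstring (thumbnails are not published
+ # to Kafka when --jobs > 1)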
+ multi_worker = MultiprocessWrapper(worker, args.sink)
+ pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+ else:
+ worker = PdfExtractWorker(
+ wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink
+ )
+ pusher = JsonLinePusher(worker, args.json_file)
+ pusher.run()
+
+
+def run_extract_cdx(args):
+ wayback_client = WaybackClient()
+ if args.jobs > 1:
+ worker = PdfExtractWorker(wayback_client, sink=None, thumbnail_sink=None)
+ multi_worker = MultiprocessWrapper(worker, args.sink)
+ pusher = CdxLinePusher(
+ multi_worker,
+ args.cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=["application/pdf"],
+ batch_size=args.jobs,
+ )
+ else:
+ worker = PdfExtractWorker(
+ wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink
+ )
+ pusher = CdxLinePusher(
+ worker,
+ args.cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=["application/pdf"],
+ )
+ pusher.run()
+
+
+def run_extract_zipfile(args):
+ if args.jobs > 1:
+ print("multi-processing: {}".format(args.jobs), file=sys.stderr)
+ worker = PdfExtractBlobWorker(sink=None, thumbnail_sink=None)
+ multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+ pusher = ZipfilePusher(multi_worker, args.zip_file, batch_size=args.jobs)
+ else:
+ worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+ pusher = ZipfilePusher(worker, args.zip_file)
+ pusher.run()
+
+
+def run_single(args):
+ worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink)
+ with open(args.pdf_file, "rb") as pdf_file:
+ pdf_bytes = pdf_file.read()
+ worker.push_record(pdf_bytes)
+ worker.finish()
+ if args.thumbnail_sink:
+ args.thumbnail_sink.finish()
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
+ subparsers = parser.add_subparsers()
+
+ sub_extract_json = subparsers.add_parser(
+ "extract-json",
+ help="for each JSON line with CDX info, fetches PDF and does PDF extraction",
+ )
+ sub_extract_json.set_defaults(func=run_extract_json)
+ sub_extract_json.add_argument(
+ "json_file",
+ help="JSON file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_extract_cdx = subparsers.add_parser(
+ "extract-cdx", help="for each CDX line, fetches PDF and does PDF extraction"
+ )
+ sub_extract_cdx.set_defaults(func=run_extract_cdx)
+ sub_extract_cdx.add_argument(
+ "cdx_file",
+ help="CDX file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_extract_zipfile = subparsers.add_parser(
+ "extract-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does PDF extract for each",
+ )
+ sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
+ sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
+
+ sub_single = subparsers.add_parser("single", help="opens single PDF and extracts it")
+ sub_single.set_defaults(func=run_single)
+ sub_single.add_argument("pdf_file", help="single PDF to extract", type=str)
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ parser.print_help(file=sys.stderr)
+ sys.exit(-1)
+
+ args.sink = None
+ args.thumbnail_sink = None
+ if args.kafka_mode:
+ text_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env)
+ args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=text_topic)
+ args.thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts, produce_topic=thumbnail_topic
+ )
+ print(
+ "Running in kafka output mode, publishing to {} and {}\n".format(
+ text_topic, thumbnail_topic
+ ),
+ file=sys.stderr,
+ )
+ else:
+ args.sink = None
+ args.thumbnail_sink = None
+
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
new file mode 100755
index 0000000..24b749d
--- /dev/null
+++ b/python/pdftrio_tool.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""
+Basically just a copy of grobid_tool.py, but for PDF classification instead of
+text extraction.
+
+Example of large parallel run, locally:
+
+cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
+"""
+
+import argparse
+import sys
+
+from sandcrawler import *
+
+
+def run_classify_pdf_json(args):
+ pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
+ wayback_client = WaybackClient()
+ if args.jobs > 1:
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode
+ )
+ multi_worker = MultiprocessWrapper(worker, args.sink)
+ pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+ else:
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode
+ )
+ pusher = JsonLinePusher(worker, args.json_file)
+ pusher.run()
+
+
+def run_classify_pdf_cdx(args):
+ pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
+ wayback_client = WaybackClient()
+ if args.jobs > 1:
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode
+ )
+ multi_worker = MultiprocessWrapper(worker, args.sink)
+ pusher = CdxLinePusher(
+ multi_worker,
+ args.cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=["application/pdf"],
+ batch_size=args.jobs,
+ )
+ else:
+ worker = PdfTrioWorker(
+ pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode
+ )
+ pusher = CdxLinePusher(
+ worker,
+ args.cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=["application/pdf"],
+ )
+ pusher.run()
+
+
+def run_classify_pdf_zipfile(args):
+ pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host)
+ worker = PdfTrioBlobWorker(pdftrio_client, sink=args.sink, mode=args.pdftrio_mode)
+ pusher = ZipfilePusher(worker, args.zip_file)
+ pusher.run()
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+ )
+ parser.add_argument(
+ "--pdftrio-host", default="http://pdftrio.qa.fatcat.wiki", help="pdftrio API host/port"
+ )
+ parser.add_argument(
+ "--pdftrio-mode", default="auto", help="which classification mode to use"
+ )
+ subparsers = parser.add_subparsers()
+
+ sub_classify_pdf_json = subparsers.add_parser(
+ "classify-pdf-json",
+ help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion",
+ )
+ sub_classify_pdf_json.set_defaults(func=run_classify_pdf_json)
+ sub_classify_pdf_json.add_argument(
+ "json_file",
+ help="JSON file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_classify_pdf_cdx = subparsers.add_parser(
+ "classify-pdf-cdx",
+ help="for each CDX line, fetches PDF and does pdftrio classify_pdfion",
+ )
+ sub_classify_pdf_cdx.set_defaults(func=run_classify_pdf_cdx)
+ sub_classify_pdf_cdx.add_argument(
+ "cdx_file",
+ help="CDX file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_classify_pdf_zipfile = subparsers.add_parser(
+ "classify-pdf-zipfile",
+ help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each",
+ )
+ sub_classify_pdf_zipfile.set_defaults(func=run_classify_pdf_zipfile)
+ sub_classify_pdf_zipfile.add_argument(
+ "zip_file", help="zipfile with PDFs to classify", type=str
+ )
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ parser.print_help(file=sys.stderr)
+ sys.exit(-1)
+
+ args.sink = None
+ if args.kafka_mode:
+ produce_topic = "sandcrawler-{}.pdftrio-output".format(args.kafka_env)
+ print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
+ args.sink = KafkaSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
+
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/persist_tool.py b/python/persist_tool.py
new file mode 100755
index 0000000..e08d66c
--- /dev/null
+++ b/python/persist_tool.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python3
+"""
+Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).
+
+Normally this is done by workers (in sandcrawler_worker.py) consuming from
+Kafka feeds, but sometimes we have bulk processing output we want to backfill.
+"""
+
+import argparse
+import os
+import sys
+
+from sandcrawler import *
+from sandcrawler.persist import *
+
+
+def run_cdx(args):
+ worker = PersistCdxWorker(
+ db_url=args.db_url,
+ )
+ filter_mimetypes = ["application/pdf"]
+ if args.no_mimetype_filter:
+ filter_mimetypes = None
+ pusher = CdxLinePusher(
+ worker,
+ args.cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=filter_mimetypes,
+ # allow_octet_stream
+ batch_size=200,
+ )
+ pusher.run()
+
+
+def run_grobid(args):
+ worker = PersistGrobidWorker(
+ db_url=args.db_url,
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ s3_only=args.s3_only,
+ db_only=args.db_only,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=50,
+ )
+ pusher.run()
+
+
+def run_grobid_disk(args):
+ """
+ Writes XML to individual files on disk, and also prints non-XML metadata to
+ stdout as JSON, which can be redirected to a separate file.
+ """
+ worker = PersistGrobidDiskWorker(
+ output_dir=args.output_dir,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ )
+ pusher.run()
+
+
+def run_pdftrio(args):
+ worker = PersistPdfTrioWorker(
+ db_url=args.db_url,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=100,
+ )
+ pusher.run()
+
+
+def run_pdftext(args):
+ worker = PersistPdfTextWorker(
+ db_url=args.db_url,
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ s3_only=args.s3_only,
+ db_only=args.db_only,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=50,
+ )
+ pusher.run()
+
+
+def run_ingest_file_result(args):
+ worker = PersistIngestFileResultWorker(
+ db_url=args.db_url,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=200,
+ )
+ pusher.run()
+
+
+def run_ingest_request(args):
+ worker = PersistIngestRequestWorker(
+ db_url=args.db_url,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=200,
+ )
+ pusher.run()
+
+
+def run_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
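+ # smaller batches when --parse-refs is set, since each record may trigger
+ # a (slow) GROBID call to parse unstructured references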
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=batch_size,
+ )
+ pusher.run()
+
+
+def run_grobid_refs(args):
+ worker = PersistGrobidRefsWorker(
+ db_url=args.db_url,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=100,
+ )
+ pusher.run()
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--db-url",
+ help="postgresql database connection string",
+ default="postgres:///sandcrawler",
+ )
+ parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000")
+ parser.add_argument(
+ "--s3-access-key",
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_ACCESS_KEY"),
+ )
+ parser.add_argument(
+ "--s3-secret-key",
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_SECRET_KEY"),
+ )
+ parser.add_argument(
+ "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev"
+ )
+ subparsers = parser.add_subparsers()
+
+ sub_cdx = subparsers.add_parser("cdx", help="backfill a CDX file into postgresql cdx table")
+ sub_cdx.set_defaults(func=run_cdx)
+ sub_cdx.add_argument(
+ "cdx_file",
+ help="CDX file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+ sub_cdx.add_argument(
+ "--no-mimetype-filter",
+ action="store_true",
+ help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)",
+ )
+
+ sub_grobid = subparsers.add_parser(
+ "grobid", help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
+ sub_grobid.set_defaults(func=run_grobid)
+ sub_grobid.add_argument(
+ "json_file",
+ help="grobid file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+ sub_grobid.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_grobid.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)",
+ )
+
+ sub_pdftext = subparsers.add_parser(
+ "pdftext", help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
+ sub_pdftext.set_defaults(func=run_pdftext)
+ sub_pdftext.add_argument(
+ "json_file",
+ help="pdftext file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+ sub_pdftext.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_pdftext.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to sandcrawler-db (don't save TEI-XML to S3)",
+ )
+
+ sub_grobid_disk = subparsers.add_parser(
+ "grobid-disk", help="dump GRBOID output to (local) files on disk"
+ )
+ sub_grobid_disk.set_defaults(func=run_grobid_disk)
+ sub_grobid_disk.add_argument(
+ "json_file",
+ help="grobid file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+ sub_grobid_disk.add_argument("output_dir", help="base directory to output into", type=str)
+
+ sub_pdftrio = subparsers.add_parser(
+ "pdftrio", help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)"
+ )
+ sub_pdftrio.set_defaults(func=run_pdftrio)
+ sub_pdftrio.add_argument(
+ "json_file",
+ help="pdftrio file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_ingest_file_result = subparsers.add_parser(
+ "ingest-file-result", help="backfill a ingest_file_result JSON dump into postgresql"
+ )
+ sub_ingest_file_result.set_defaults(func=run_ingest_file_result)
+ sub_ingest_file_result.add_argument(
+ "json_file",
+ help="ingest_file_result file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_ingest_request = subparsers.add_parser(
+ "ingest-request", help="backfill a ingest_request JSON dump into postgresql"
+ )
+ sub_ingest_request.set_defaults(func=run_ingest_request)
+ sub_ingest_request.add_argument(
+ "json_file",
+ help="ingest_request to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ sub_crossref = subparsers.add_parser(
+ "crossref",
+ help="backfill a crossref JSON dump into postgresql, and extract references at the same time",
+ )
+ sub_crossref.set_defaults(func=run_crossref)
+ sub_crossref.add_argument(
+ "json_file",
+ help="crossref file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+ sub_crossref.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ sub_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
+
+ sub_grobid_refs = subparsers.add_parser(
+ "grobid-refs", help="backfill a grobid_refs JSON dump into postgresql"
+ )
+ sub_grobid_refs.set_defaults(func=run_grobid_refs)
+ sub_grobid_refs.add_argument(
+ "json_file",
+ help="grobid_refs to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("Tell me what to do!", file=sys.stderr)
+ sys.exit(-1)
+
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..2cef007
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,7 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta:__legacy__"
+
+[tool.isort]
+profile = "black"
+line_length = 96
diff --git a/python/pytest.ini b/python/pytest.ini
index 61c9351..18e8cf0 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -1,8 +1,5 @@
-
[pytest]
-ignore = setup.py
-
# allow imports from files in current directory
python_paths = .
@@ -11,10 +8,19 @@ python_files = *.py
addopts = --pylint --pylint-rcfile=.pylintrc --pylint-error-types=EF --pylint-jobs=4
-# these are internal to raven (sentry client) and misaka (Markdown client)
+# ignore various third party warnings (in .venv)
filterwarnings =
ignore:.*common_exception_handling.*StopIteration:PendingDeprecationWarning
- ignore:passing extensions and flags as constants is deprecated:DeprecationWarning
ignore:.*deprecated and will be removed in Werkzeug 1.0.*:DeprecationWarning
+ ignore::DeprecationWarning:.*surt
+ ignore::DeprecationWarning:.*urllib3
+ ignore::DeprecationWarning:.*wayback
+ ignore::DeprecationWarning:.*PIL
+ ignore::DeprecationWarning:.*justext
+ ignore::DeprecationWarning:.*internetarchive
+ ignore::DeprecationWarning:.*minio
+ ignore::DeprecationWarning:.*base_reporter
+ ignore::DeprecationWarning:.*loccache
+ ignore:.*pytz-deprecation-shim
log_level = INFO
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 39503fc..469c2a2 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,6 +1,49 @@
-
-from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
-from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError
-
+from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient
+from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker
+from .ia import (
+ CdxApiClient,
+ CdxApiError,
+ CdxPartial,
+ CdxRow,
+ PetaboxError,
+ ResourceResult,
+ SavePageNowBackoffError,
+ SavePageNowClient,
+ SavePageNowError,
+ WarcResource,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+)
+from .ingest_file import IngestFileWorker
+from .ingest_fileset import IngestFilesetWorker
+from .misc import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_datetime,
+ parse_cdx_line,
+)
+from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker
+from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker
+from .persist import (
+ PersistCdxWorker,
+ PersistGrobidDiskWorker,
+ PersistGrobidWorker,
+ PersistIngestFileResultWorker,
+ PersistIngestRequestWorker,
+ PersistPdfTextWorker,
+ PersistPdfTrioWorker,
+ PersistThumbnailWorker,
+)
+from .workers import (
+ BlackholeSink,
+ CdxLinePusher,
+ JsonLinePusher,
+ KafkaCompressSink,
+ KafkaJsonPusher,
+ KafkaSink,
+ MultiprocessWrapper,
+ ZipfilePusher,
+)
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
new file mode 100644
index 0000000..f9018ec
--- /dev/null
+++ b/python/sandcrawler/db.py
@@ -0,0 +1,650 @@
+import datetime
+import json
+from typing import Any, Dict, List, Optional, Tuple
+
+import psycopg2
+import psycopg2.extras
+
+from .misc import requests_retry_session
+
+
+class SandcrawlerPostgrestClient:
+ def __init__(self, api_url: str = "http://wbgrp-svc506.us.archive.org:3030", **kwargs):
+ self.api_url = api_url
+ self.http_session = requests_retry_session()
+
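+ # The lookups below use PostgREST filter syntax: a query parameter such as
+ # sha1hex=eq.<value> selects rows where that column equals the value.
+ # PostgREST responses are always JSON arrays; these helpers return None
+ # when the array is empty.
+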
+ def get_cdx(self, url: str) -> Optional[dict]:
+ resp = self.http_session.get(self.api_url + "/cdx", params=dict(url="eq." + url))
+ resp.raise_for_status()
+ return resp.json() or None
+
+ def get_grobid(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1)
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_pdftrio(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/pdftrio", params=dict(sha1hex="eq." + sha1)
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_pdf_meta(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_html_meta(self, sha1hex: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/html_meta",
+ params=dict(sha1hex=f"eq.{sha1hex}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_file_meta(self, sha1: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/file_meta", params=dict(sha1hex="eq." + sha1)
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/ingest_file_result",
+ params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_ingest_fileset_platform(self, ingest_type: str, url: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/ingest_fileset_platform",
+ params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_crossref(self, doi: str) -> Optional[dict]:
+ resp = self.http_session.get(self.api_url + "/crossref", params=dict(doi=f"eq.{doi}"))
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_crossref_with_refs(self, doi: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/crossref_with_refs", params=dict(doi=f"eq.{doi}")
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+ def get_grobid_refs(self, source: str, source_id: str) -> Optional[dict]:
+ resp = self.http_session.get(
+ self.api_url + "/grobid_refs",
+ params=dict(source=f"eq.{source}", source_id=f"eq.{source_id}"),
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
+
+class SandcrawlerPostgresClient:
+ def __init__(self, db_url: str, **kwargs):
+ self.conn = psycopg2.connect(db_url)
+
+ def cursor(self) -> psycopg2.extensions.cursor:
+ return self.conn.cursor()
+
+ def commit(self) -> None:
+ self.conn.commit()
+
+ def _inserts_and_updates(self, resp: List[Tuple], on_conflict: str) -> Tuple[int, int]:
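+ # The insert_* methods below append "RETURNING xmax" to their INSERT
+ # statements: xmax is 0 for freshly inserted rows and non-zero for rows
+ # taken through the ON CONFLICT ... DO UPDATE path, which is how inserts
+ # and updates are counted separately here.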
+ resp_codes = [int(r[0]) for r in resp]
+ inserts = len([r for r in resp_codes if r == 0])
+ if on_conflict == "update":
+ updates = len([r for r in resp_codes if r != 0])
+ else:
+ updates = 0
+ return (inserts, updates)
+
+ def insert_cdx(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
+ VALUES %s
+ ON CONFLICT ON CONSTRAINT cdx_pkey DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+
+ batch = [d for d in batch if d.get("warc_path")]
+ if not batch:
+ return (0, 0)
+ rows = [
+ (
+ d["url"],
+ d["datetime"],
+ d["sha1hex"],
+ d["mimetype"],
+ d["warc_path"],
+ int(d["warc_csize"]),
+ int(d["warc_offset"]),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (url, datetime)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_file_meta(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ file_meta(sha1hex, sha256hex, md5hex, size_bytes, mimetype)
+ VALUES %s
+ ON CONFLICT (sha1hex) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ sha256hex=EXCLUDED.sha256hex,
+ md5hex=EXCLUDED.md5hex,
+ size_bytes=EXCLUDED.size_bytes,
+ mimetype=EXCLUDED.mimetype
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (d["sha1hex"], d["sha256hex"], d["md5hex"], int(d["size_bytes"]), d["mimetype"])
+ for d in batch
+ ]
+ # filter out duplicate rows by key (sha1hex)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_grobid(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ grobid (sha1hex, grobid_version, status_code, status, fatcat_release, updated, metadata)
+ VALUES %s
+ ON CONFLICT (sha1hex) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ grobid_version=EXCLUDED.grobid_version,
+ status_code=EXCLUDED.status_code,
+ status=EXCLUDED.status,
+ fatcat_release=EXCLUDED.fatcat_release,
+ updated=EXCLUDED.updated,
+ metadata=EXCLUDED.metadata
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ for r in batch:
+ if r.get("metadata"):
+ # sometimes these are only in metadata; shouldn't pass through
+ # though (to save database space)
+ dupe_fields = ("fatcat_release", "grobid_version")
+ for k in dupe_fields:
+ if k not in r:
+ r[k] = r["metadata"].get(k)
+ r["metadata"].pop(k, None)
+ r["metadata"] = json.dumps(r["metadata"], sort_keys=True)
+ now = datetime.datetime.now()
+ rows = [
+ (
+ d["key"],
+ d.get("grobid_version") or None,
+ d["status_code"],
+ d["status"],
+ d.get("fatcat_release") or None,
+ d.get("updated") or now,
+ d.get("metadata") or None,
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (sha1hex)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_pdf_meta(
+ self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing"
+ ) -> Tuple[int, int]:
+ """
+ batch elements are expected to have .to_sql_tuple() method
+ """
+ sql = """
+ INSERT INTO
+ pdf_meta (sha1hex, updated, status, has_page0_thumbnail, page_count, word_count, page0_height, page0_width, permanent_id, pdf_created, pdf_version, metadata)
+ VALUES %s
+ ON CONFLICT (sha1hex) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=EXCLUDED.updated,
+ status=EXCLUDED.status,
+ has_page0_thumbnail=EXCLUDED.has_page0_thumbnail,
+ page_count=EXCLUDED.page_count,
+ word_count=EXCLUDED.word_count,
+ page0_height=EXCLUDED.page0_height,
+ page0_width=EXCLUDED.page0_width,
+ permanent_id=EXCLUDED.permanent_id,
+ pdf_created=EXCLUDED.pdf_created,
+ pdf_version=EXCLUDED.pdf_version,
+ metadata=EXCLUDED.metadata
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ # filter out duplicate rows by key (sha1hex)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_html_meta(
+ self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing"
+ ) -> Tuple[int, int]:
+ """
+ batch elements are expected to have .to_sql_tuple() method
+ """
+ sql = """
+ INSERT INTO
+ html_meta (sha1hex, updated, status, scope, has_teixml, has_thumbnail, word_count, biblio, resources)
+ VALUES %s
+ ON CONFLICT (sha1hex) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=EXCLUDED.updated,
+ status=EXCLUDED.status,
+ scope=EXCLUDED.scope,
+ has_teixml=EXCLUDED.has_teixml,
+ has_thumbnail=EXCLUDED.has_thumbnail,
+ word_count=EXCLUDED.word_count,
+ biblio=EXCLUDED.biblio,
+ resources=EXCLUDED.resources
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ # filter out duplicate rows by key (sha1hex)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_pdftrio(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ pdftrio (sha1hex, updated, status_code, status, pdftrio_version,
+ models_date, ensemble_score, bert_score, linear_score,
+ image_score)
+ VALUES %s
+ ON CONFLICT (sha1hex) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=EXCLUDED.updated,
+ status_code=EXCLUDED.status_code,
+ status=EXCLUDED.status,
+ pdftrio_version=EXCLUDED.pdftrio_version,
+ models_date=EXCLUDED.models_date,
+ ensemble_score=EXCLUDED.ensemble_score,
+ bert_score=EXCLUDED.bert_score,
+ linear_score=EXCLUDED.linear_score,
+ image_score=EXCLUDED.image_score
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ now = datetime.datetime.now()
+ rows = [
+ (
+ d["key"],
+ d.get("updated") or now,
+ d["status_code"],
+ d["status"],
+ d.get("versions", {}).get("pdftrio_version") or None,
+ d.get("versions", {}).get("models_date") or None,
+ d.get("ensemble_score"),
+ d.get("bert_score"),
+ d.get("linear_score"),
+ d.get("image_score"),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (sha1hex)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_ingest_request(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ ingest_request (link_source, link_source_id, ingest_type, base_url, ingest_request_source, release_stage, request)
+ VALUES %s
+ ON CONFLICT ON CONSTRAINT ingest_request_pkey DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ for r in batch:
+ # in case these fields were already packed into 'request'
+ extra = r.get("request", {})
+ for k in ("ext_ids", "fatcat_release", "edit_extra", "rel"):
+ if r.get(k):
+ extra[k] = r[k]
+ if extra:
+ r["extra"] = json.dumps(extra, sort_keys=True)
+ rows = [
+ (
+ d["link_source"],
+ d["link_source_id"],
+ d["ingest_type"],
+ d["base_url"],
+ d.get("ingest_request_source"),
+ d.get("release_stage") or None,
+ d.get("extra") or None,
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (link_source, link_source_id, ingest_type, base_url)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1], b[2], b[3])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_ingest_file_result(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ ingest_file_result (ingest_type, base_url, hit, status, terminal_url, terminal_dt, terminal_status_code, terminal_sha1hex)
+ VALUES %s
+ ON CONFLICT ON CONSTRAINT ingest_file_result_pkey DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=now(),
+ hit=EXCLUDED.hit,
+ status=EXCLUDED.status,
+ terminal_url=EXCLUDED.terminal_url,
+ terminal_dt=EXCLUDED.terminal_dt,
+ terminal_status_code=EXCLUDED.terminal_status_code,
+ terminal_sha1hex=EXCLUDED.terminal_sha1hex
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (
+ d["ingest_type"],
+ d["base_url"],
+ bool(d["hit"]),
+ d["status"],
+ d.get("terminal_url"),
+ d.get("terminal_dt"),
+ d.get("terminal_status_code"),
+ d.get("terminal_sha1hex"),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (ingest_type, base_url)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_ingest_fileset_platform(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "nothing",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ ingest_fileset_platform (ingest_type, base_url, hit, status, platform_name, platform_domain, platform_id, ingest_strategy, total_size, file_count, archiveorg_item_name, archiveorg_item_bundle_path, web_bundle_url, web_bundle_dt, manifest)
+ VALUES %s
+ ON CONFLICT ON CONSTRAINT ingest_fileset_platform_pkey DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ updated=now(),
+ hit=EXCLUDED.hit,
+ status=EXCLUDED.status,
+ platform_name=EXCLUDED.platform_name,
+ platform_domain=EXCLUDED.platform_domain,
+ platform_id=EXCLUDED.platform_id,
+ ingest_strategy=EXCLUDED.ingest_strategy,
+ total_size=EXCLUDED.total_size,
+ file_count=EXCLUDED.file_count,
+ archiveorg_item_name=EXCLUDED.archiveorg_item_name,
+ archiveorg_item_bundle_path=EXCLUDED.archiveorg_item_bundle_path,
+ web_bundle_url=EXCLUDED.web_bundle_url,
+ web_bundle_dt=EXCLUDED.web_bundle_dt,
+ manifest=EXCLUDED.manifest
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (
+ d["ingest_type"],
+ d["base_url"],
+ bool(d["hit"]),
+ d["status"],
+ d.get("platform_name"),
+ d.get("platform_domain"),
+ d.get("platform_id"),
+ d.get("ingest_strategy"),
+ d.get("total_size"),
+ d.get("file_count"),
+ d.get("archiveorg_item_name"),
+ d.get("archiveorg_item_bundle_path"),
+ d.get("web_bundle_url"),
+ d.get("web_bundle_dt"),
+ d.get("manifest"),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (ingest_type, base_url)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_crossref(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "update",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ crossref (doi, indexed, record)
+ VALUES %s
+ ON CONFLICT (doi) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ indexed=EXCLUDED.indexed,
+ record=EXCLUDED.record
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ rows = [
+ (
+ d["doi"],
+ d.get("indexed") or None,
+ json.dumps(d["record"], sort_keys=True),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (doi)
+ row_dict = dict()
+ for b in rows:
+ row_dict[b[0]] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
+
+ def insert_grobid_refs(
+ self,
+ cur: psycopg2.extensions.cursor,
+ batch: List[Dict[str, Any]],
+ on_conflict: str = "update",
+ ) -> Tuple[int, int]:
+ sql = """
+ INSERT INTO
+ grobid_refs (source, source_id, source_ts, updated, refs_json)
+ VALUES %s
+ ON CONFLICT (source, source_id) DO
+ """
+ if on_conflict.lower() == "nothing":
+ sql += " NOTHING"
+ elif on_conflict.lower() == "update":
+ sql += """ UPDATE SET
+ source_ts=EXCLUDED.source_ts,
+ updated=EXCLUDED.updated,
+ refs_json=EXCLUDED.refs_json
+ """
+ else:
+ raise NotImplementedError("on_conflict: {}".format(on_conflict))
+ sql += " RETURNING xmax;"
+ now = datetime.datetime.now()
+ rows = [
+ (
+ d["source"],
+ d["source_id"],
+ d.get("source_ts") or None,
+ d.get("updated") or now,
+ json.dumps(d["refs_json"], sort_keys=True),
+ )
+ for d in batch
+ ]
+ # filter out duplicate rows by key (source, source_id)
+ row_dict = dict()
+ for b in rows:
+ row_dict[(b[0], b[1])] = b
+ rows = list(row_dict.values())
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ return self._inserts_and_updates(resp, on_conflict)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
new file mode 100644
index 0000000..5c13318
--- /dev/null
+++ b/python/sandcrawler/fileset_platforms.py
@@ -0,0 +1,832 @@
+import urllib.parse
+from typing import Optional, Tuple
+
+import internetarchive
+
+from sandcrawler.fileset_types import (
+ FilesetManifestFile,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformRestrictedError,
+ PlatformScopeError,
+)
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+from sandcrawler.misc import requests_retry_session
+
+
+class FilesetPlatformHelper:
+ def __init__(self):
+ self.platform_name = "unknown"
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+ """
+ Does this request look like it matches this platform?
+ """
+ raise NotImplementedError()
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+ raise NotImplementedError()
+
+ def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+ assert item.manifest
+ total_size = sum([m.size or 0 for m in item.manifest]) or 0
+ largest_size = max([m.size or 0 for m in item.manifest]) or 0
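+ # decision sketch for the thresholds below: a lone file under 64 MiB is fetched
+ # as a plain web file; a lone larger file goes into an archive.org item;
+ # multi-file sets stay "web" only while every file is under 64 MiB and the
+ # total is under 128 GiB, otherwise they become an archive.org fileset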
+ if len(item.manifest) == 1:
+ if total_size < 64 * 1024 * 1024:
+ return IngestStrategy.WebFile
+ else:
+ return IngestStrategy.ArchiveorgFile
+ else:
+ if largest_size < 64 * 1024 * 1024 and total_size < 128 * 1024 * 1024 * 1024:
+ return IngestStrategy.WebFileset
+ else:
+ return IngestStrategy.ArchiveorgFileset
+
+
+class DataverseHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "dataverse"
+ self.session = requests_retry_session()
+
+ @staticmethod
+ def parse_dataverse_persistentid(pid: str) -> dict:
+ """
+ Parses a persistentId into 5 sections:
+
+ - type (doi or hdl)
+ - authority (eg, DOI prefix)
+ - shoulder (optional, eg 'DVN')
+ - dataset_id (6-character)
+ - file_id
+
+ The returned dict always has all components, which may be 'None' if optional.
+
+ This is possible because the Dataverse software only supports a handful
+ of configurations and persistent identifier types.
+
+ If there is an error parsing, raises a ValueError
+ """
+ id_type = None
+ if pid.startswith("doi:10."):
+ id_type = "doi"
+ pid = pid[4:]
+ elif pid.startswith("hdl:"):
+ id_type = "hdl"
+ pid = pid[4:]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ comp = pid.split("/")
+ if len(comp) < 2:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ authority = comp[0]
+ shoulder = None
+ dataset_id = None
+ file_id = None
+ if len(comp[1]) != 6 and len(comp) == 3:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ elif len(comp[1]) != 6 and len(comp) == 4:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ file_id = comp[3]
+ elif len(comp[1]) == 6 and len(comp) == 2:
+ dataset_id = comp[1]
+ elif len(comp[1]) == 6 and len(comp) == 3:
+ dataset_id = comp[1]
+ file_id = comp[2]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ if len(dataset_id) != 6:
+ raise ValueError(f"expected a 6-digit dataverse dataset id: {dataset_id}")
+ if file_id and len(file_id) != 6:
+ raise ValueError(f"expected a 6-digit dataverse file id: {file_id}")
+
+ return {
+ "type": id_type,
+ "authority": authority,
+ "shoulder": shoulder,
+ "dataset_id": dataset_id,
+ "file_id": file_id,
+ }
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # TODO: could also do HTML platform detection or something?
+
+ components = urllib.parse.urlparse(url)
+ # platform_domain = components.netloc.split(':')[0].lower()
+ params = urllib.parse.parse_qs(components.query)
+ id_param = params.get("persistentId")
+ if not id_param:
+ return False
+ platform_id = id_param[0]
+
+ try:
+ self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+ return False
+
+ return True
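+ # for example, a URL like
+ # https://demo.dataverse.org/dataset.xhtml?persistentId=doi:10.5072/FK2/J8SJZB
+ # (hostname and PID here are just the demo values from the docstring below)
+ # carries a parseable persistentId and would match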
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+
+
+ HTTP GET https://demo.dataverse.org/api/datasets/export?exporter=dataverse_json&persistentId=doi:10.5072/FK2/J8SJZB
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # 1. extract domain, PID, and version from URL
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ params = urllib.parse.parse_qs(components.query)
+ id_param = params.get("persistentId")
+ if not (id_param and id_param[0]):
+ raise PlatformScopeError("Expected a Dataverse persistentId in URL")
+ platform_id = id_param[0]
+ version_param = params.get("version")
+ dataset_version = None
+ if version_param:
+ dataset_version = version_param[0]
+
+ try:
+ parsed_id = self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+ raise PlatformScopeError("not actually in scope")
+
+ if parsed_id["file_id"]:
+ # TODO: maybe we could support this?
+ raise PlatformScopeError(
+ "only entire dataverse datasets can be archived with this tool"
+ )
+
+ # 1b. if we didn't get a version number from URL, fetch it from API
+ if not dataset_version:
+ resp = self.session.get(
+ f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+ if "latestVersion" not in obj["data"]:
+ raise PlatformScopeError("could not find latest version for dataverse record")
+ obj_latest = obj["data"]["latestVersion"]
+ dataset_version = (
+ f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
+
+ # 2. API fetch
+ resp = self.session.get(
+ f"https://{platform_domain}/api/datasets/:persistentId/?persistentId={platform_id}&version={dataset_version}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+
+ obj_latest = obj["data"]["latestVersion"]
+ assert (
+ dataset_version
+ == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}"
+ )
+ assert platform_id == obj_latest["datasetPersistentId"]
+
+ manifest = []
+ for row in obj_latest["files"]:
+ df = row["dataFile"]
+ df_persistent_id = df["persistentId"]
+ platform_url = f"https://{platform_domain}/api/access/datafile/:persistentId/?persistentId={df_persistent_id}"
+ if df.get("originalFileName"):
+ platform_url += "&format=original"
+
+ extra = dict()
+ # TODO: always save the version field?
+ if row.get("version") != 1:
+ extra["version"] = row["version"]
+ if "description" in df:
+ extra["description"] = df["description"]
+ manifest.append(
+ FilesetManifestFile(
+ path=df.get("originalFileName") or df["filename"],
+ size=df.get("originalFileSize") or df["filesize"],
+ md5=df["md5"],
+ # NOTE: don't get: sha1, sha256
+ mimetype=df["contentType"],
+ platform_url=platform_url,
+ extra=extra or None,
+ )
+ )
+
+ platform_sub_id = platform_id.split("/")[-1]
+ archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ date=obj_latest["releaseTime"].split("T")[0],
+ source=f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}",
+ )
+ if platform_id.startswith("doi:10."):
+ archiveorg_item_meta["doi"] = platform_id.replace("doi:", "")
+ for block in obj_latest["metadataBlocks"]["citation"]["fields"]:
+ if block["typeName"] == "title":
+ archiveorg_item_meta["title"] = block["value"]
+ elif block["typeName"] == "depositor":
+ archiveorg_item_meta["creator"] = block["value"]
+ elif block["typeName"] == "dsDescription":
+ archiveorg_item_meta["description"] = block["value"][0]["dsDescriptionValue"][
+ "value"
+ ]
+
+ archiveorg_item_meta["description"] = archiveorg_item_meta.get("description", "")
+ if obj_latest.get("termsOfUse"):
+ archiveorg_item_meta["description"] += "\n<br>\n" + obj_latest["termsOfUse"]
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ web_bundle_url=f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=dataset_version),
+ )
+
+
+def test_parse_dataverse_persistentid() -> None:
+
+ valid = {
+ "doi:10.25625/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.25625",
+ "shoulder": None,
+ "dataset_id": "LL6WXZ",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": "LL6WXZ",
+ },
+ "hdl:20.500.12690/RIN/IDDOAH/BTNH25": {
+ "type": "hdl",
+ "authority": "20.500.12690",
+ "shoulder": "RIN",
+ "dataset_id": "IDDOAH",
+ "file_id": "BTNH25",
+ },
+ "doi:10.7910/DVN/6HPRIG": {
+ "type": "doi",
+ "authority": "10.7910",
+ "shoulder": "DVN",
+ "dataset_id": "6HPRIG",
+ "file_id": None,
+ },
+ }
+
+ invalid = [
+ # "doi:10.5072/FK2/J8SJZB/LL6WXZ",
+ "doi:10.25625/abcd",
+ "other:10.25625/LL6WXZ",
+ "10.25625/LL6WXZ",
+ "doi:10.5072/FK2/J8SJZB/LL6WXZv123",
+ ]
+
+ for pid, val in valid.items():
+ assert DataverseHelper.parse_dataverse_persistentid(pid) == val
+
+ for pid in invalid:
+ try:
+ DataverseHelper.parse_dataverse_persistentid(pid)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
+
+class FigshareHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "figshare"
+ self.session = requests_retry_session()
+
+ @staticmethod
+ def parse_figshare_url_path(path: str) -> Tuple[str, Optional[str]]:
+ """
+ Tries to parse a figshare URL into ID number and (optional) version number.
+
+ Returns a two-element tuple; version number will be None if not found
+
+ Raises a ValueError if not a figshare URL
+ """
+ # eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1
+ # /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4
+
+ comp = path.split("/")
+ if len(comp) < 4 or comp[1] != "articles":
+ raise ValueError(f"not a figshare URL: {path}")
+
+ comp = comp[2:]
+ if comp[0] in [
+ "dataset",
+ # TODO: should the following be considered "out of scope"?
+ "journal_contribution",
+ "presentation",
+ "poster",
+ "thesis",
+ ]:
+ comp = comp[1:]
+
+ if len(comp) == 3 and comp[1].isdigit() and comp[2].isdigit():
+ return (comp[1], comp[2])
+ elif len(comp) == 2 and comp[1].isdigit():
+ return (comp[1], None)
+ else:
+ raise ValueError(f"couldn't find figshare identiier: {path}")
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+
+ # only work with full, versioned figshare.com URLs
+ if "figshare.com" not in platform_domain:
+ return False
+
+ try:
+ parsed = self.parse_figshare_url_path(components.path)
+ except ValueError:
+ return False
+
+ # requires both an article ID and a version number component
+ if parsed[0] and parsed[1]:
+ return True
+
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # 1. extract domain, PID, and version from URL
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+
+ (platform_id, dataset_version) = self.parse_figshare_url_path(components.path)
+ assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+ assert (
+ dataset_version and dataset_version.isdigit()
+ ), f"expected numeric: {dataset_version}"
+
+ # 1b. if we didn't get a version number from URL, fetch it from API
+ # TODO: implement this code path
+
+ # 2. API fetch
+ resp = self.session.get(
+ f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}",
+ timeout=60.0,
+ )
+ resp.raise_for_status()
+ obj = resp.json()
+
+ # figshare_type = obj['defined_type_name']
+
+ if not obj["is_public"]:
+ raise PlatformRestrictedError(f"record not public: {platform_id} {dataset_version}")
+ if obj["is_embargoed"]:
+ raise PlatformRestrictedError(
+ f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})'
+ )
+
+ manifest = []
+ for row in obj["files"]:
+ manifest.append(
+ FilesetManifestFile(
+ path=row["name"],
+ size=row["size"],
+ md5=row["computed_md5"],
+ # NOTE: don't get: sha1, sha256, mimetype
+ platform_url=row["download_url"],
+ # extra=dict(),
+ )
+ )
+ if row.get("is_link_only"):
+ raise PlatformScopeError(
+ f"figshare.org file is just a link (not a file): {row['name']} at {row['download_url']}"
+ )
+
+ authors = []
+ for author in obj["authors"]:
+ authors.append(author["full_name"])
+ archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ creator=authors,
+ doi=obj["doi"],
+ title=obj["title"],
+ date=obj["published_date"],
+ source=obj["url_public_html"],
+ description=obj["description"],
+ license=obj["license"]["url"],
+ version=obj["version"],
+ )
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=dataset_version),
+ )
+
+
+def test_parse_figshare_url_path() -> None:
+
+ valid = {
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": (
+ "8987858",
+ "1",
+ ),
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": (
+ "8987858",
+ None,
+ ),
+ "/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
+ "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": (
+ "12127176",
+ "4",
+ ),
+ "/articles/journal_contribution/Improved_Time_Resolved_Measurements_of_Inorganic_Ions_in_Particulate_Matter_by_PILS_IC_Integrated_with_a_Sample_Pre_Concentration_System/1407386/3": (
+ "1407386",
+ "3",
+ ),
+ "/articles/poster/Effect_of_nanoclay_loading_on_the_thermal_decomposition_of_nanoclay_polyurethane_elastomers_obtained_by_bulk_polymerization/1094056/1": (
+ "1094056",
+ "1",
+ ),
+ }
+
+ invalid = [
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species",
+ ]
+
+ for path, val in valid.items():
+ assert FigshareHelper.parse_figshare_url_path(path) == val
+
+ for path in invalid:
+ try:
+ FigshareHelper.parse_figshare_url_path(path)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
+
+class ZenodoHelper(FilesetPlatformHelper):
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "zenodo"
+ self.session = requests_retry_session()
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ if platform_domain == "zenodo.org" and "/record/" in components.path:
+ return True
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+
+ # TODO: also look in base_url and resource-non-terminal for ident? to
+ # check for work-level redirects
+
+ # 1. extract identifier from URL
+ # eg: https://zenodo.org/record/5230255
+ components = urllib.parse.urlparse(url)
+ platform_domain = components.netloc.split(":")[0].lower()
+ if len(components.path.split("/")) < 2:
+ raise PlatformScopeError("Expected a complete, versioned figshare URL")
+
+ platform_id = components.path.split("/")[2]
+ assert platform_id.isdigit(), f"expected numeric: {platform_id}"
+
+ if "zenodo.org" not in platform_domain:
+ raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
+
+ # 2. API fetch
+ resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}", timeout=60.0)
+ if resp.status_code == 410:
+ raise PlatformRestrictedError("record deleted")
+ resp.raise_for_status()
+ obj = resp.json()
+
+ assert obj["id"] == int(platform_id)
+ work_id = obj["conceptrecid"]
+ if work_id == obj["id"]:
+ raise PlatformScopeError(
+ "got a work-level zenodo record, not a versioned record: {work_id}"
+ )
+
+ # zenodo_type = obj['metadata']['resource_type']['type']
+
+ if obj["metadata"]["access_right"] != "open":
+ raise PlatformRestrictedError(
+ "not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}"
+ )
+
+ manifest = []
+ for row in obj["files"]:
+ mf = FilesetManifestFile(
+ path=row["key"],
+ size=row["size"],
+ platform_url=row["links"]["self"],
+ # extra=dict(),
+ )
+ checksum = row["checksum"]
+ # eg: md5:35ffcab905f8224556dba76648cb7dad
+ if checksum.startswith("md5:"):
+ mf.md5 = checksum[4:]
+ elif checksum.startswith("sha1:"):
+ mf.sha1 = checksum[5:]
+ manifest.append(mf)
+
+ authors = []
+ for author in obj["metadata"]["creators"]:
+ authors.append(author["name"])
+ archiveorg_item_name = f"{platform_domain}-{platform_id}"
+ archiveorg_item_meta = dict(
+ # TODO: collection=platform_domain,
+ collection="datasets",
+ creator=authors,
+ doi=obj["doi"],
+ title=obj["metadata"]["title"],
+ date=obj["metadata"]["publication_date"],
+ source=obj["links"]["html"],
+ description=obj["metadata"]["description"],
+ license=obj["metadata"]["license"]["id"],
+ version=obj["revision"],
+ # obj['metadata']['version'] is, eg, git version tag
+ )
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain=platform_domain,
+ platform_id=platform_id,
+ archiveorg_item_name=archiveorg_item_name,
+ archiveorg_item_meta=archiveorg_item_meta,
+ # web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}",
+ # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files)
+ extra=dict(version=obj["revision"]),
+ )
+
+
+class ArchiveOrgHelper(FilesetPlatformHelper):
+
+ FORMAT_TO_MIMETYPE = {
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
+ #'application/msword (.doc)', # .doc
+ #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+ #'application/vnd.ms-excel', # .xls
+ #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
+ }
+
+ def __init__(self):
+ super().__init__()
+ self.platform_name = "archiveorg"
+ self.session = internetarchive.get_session()
+
+ @staticmethod
+ def want_item_file(f: internetarchive.File, item_name: str) -> bool:
+ """
+ Filters IA API files
+ """
+ if f.source != "original":
+ return False
+ for suffix in [
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
+ ]:
+ if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+ return False
+ if f.name.startswith("_"):
+ return False
+ if item_name.startswith("academictorrents_"):
+ for suffix in [
+ "_academictorrents.torrent",
+ "_academictorrents_torrent.txt",
+ ".bib",
+ ]:
+ if f.name == item_name + suffix:
+ return False
+ return True
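+ # e.g. for an item named "some-item", auto-generated files like
+ # "some-item_meta.xml" or "some-item_files.xml" are skipped, while files
+ # whose source is "original" (the uploaded data itself) are kept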
+
+ def match_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request["base_url"]
+ patterns = [
+ "://archive.org/details/",
+ "://archive.org/download/",
+ ]
+ for p in patterns:
+ if p in url:
+ return True
+ return False
+
+ def process_request(
+ self,
+ request: dict,
+ resource: Optional[ResourceResult],
+ html_biblio: Optional[BiblioMetadata],
+ ) -> FilesetPlatformItem:
+ """
+ Fetch platform-specific metadata for this request (eg, via API calls)
+ """
+
+ base_url_split = request["base_url"].split("/")
+ # print(base_url_split, file=sys.stderr)
+ assert len(base_url_split) in [5, 6]
+ assert base_url_split[0] in ["http:", "https:"]
+ assert base_url_split[2] == "archive.org"
+ assert base_url_split[3] in ["details", "download"]
+ item_name = base_url_split[4]
+ if len(base_url_split) == 6 and base_url_split[5]:
+ raise PlatformScopeError(
+ "got an archive.org file path, not download/details page; individual files not handled yet"
+ )
+
+ # print(f" archiveorg processing item={item_name}", file=sys.stderr)
+ item = self.session.get_item(item_name)
+ item_name = item.identifier
+ item_collection = item.metadata["collection"]
+ if type(item_collection) == list:
+ item_collection = item_collection[0]
+ assert item.metadata["mediatype"] not in ["collection", "web"]
+ item_files = item.get_files(on_the_fly=False)
+ item_files = [f for f in item_files if self.want_item_file(f, item_name)]
+ manifest = []
+ for f in item_files:
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = FilesetManifestFile(
+ path=f.name,
+ size=int(f.size),
+ sha1=f.sha1,
+ md5=f.md5,
+ mimetype=self.FORMAT_TO_MIMETYPE[f.format],
+ platform_url=f"https://archive.org/download/{item_name}/{f.name}",
+ )
+ manifest.append(mf)
+
+ return FilesetPlatformItem(
+ platform_name=self.platform_name,
+ platform_status="success",
+ manifest=manifest,
+ platform_domain="archive.org",
+ platform_id=item_name,
+ archiveorg_item_name=item_name,
+ archiveorg_item_meta=dict(collection=item_collection),
+ )
+
+ def chose_strategy(self, item: FilesetPlatformItem) -> IngestStrategy:
+ """
+ Don't use default strategy picker; we are always doing an 'existing' in this case.
+ """
+ assert item.manifest is not None
+ if len(item.manifest) == 1:
+ # NOTE: code flow does not support ArchiveorgFilesetBundle for the
+ # case of, eg, a single zipfile in an archive.org item
+ return IngestStrategy.ArchiveorgFile
+ elif len(item.manifest) >= 1:
+ return IngestStrategy.ArchiveorgFileset
+ else:
+ raise NotImplementedError("empty dataset")
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
new file mode 100644
index 0000000..1d84ce5
--- /dev/null
+++ b/python/sandcrawler/fileset_strategies.py
@@ -0,0 +1,387 @@
+import os
+import shutil
+import sys
+from typing import Optional
+
+import internetarchive
+import requests
+
+from sandcrawler.fileset_types import (
+ ArchiveStrategyResult,
+ FilesetPlatformItem,
+ IngestStrategy,
+ PlatformScopeError,
+)
+from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding
+from sandcrawler.misc import (
+ gen_file_metadata,
+ gen_file_metadata_path,
+ requests_retry_session,
+ sanitize_fs_path,
+)
+
+
+class FilesetIngestStrategy:
+ def __init__(self):
+ # self.ingest_strategy = 'unknown'
+ self.success_status = "success"
+
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ raise NotImplementedError()
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ raise NotImplementedError()
+
+
+class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.ArchiveorgFileset
+
+ # TODO: enable cleanup when confident (eg, safe path parsing)
+ self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True)
+ self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/")
+ try:
+ os.mkdir(self.working_dir)
+ except FileExistsError:
+ pass
+
+ self.http_session = requests_retry_session()
+ self.ia_session = internetarchive.get_session(
+ config={
+ "s3": {
+ "access": os.environ.get("IA_ACCESS_KEY"),
+ "secret": os.environ.get("IA_SECRET_KEY"),
+ },
+ }
+ )
+
+ def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]:
+ """
+ use API to check for item with all the files in the manifest
+
+ NOTE: this naive comparison is quadratic in number of files, aka O(N^2)
+ """
+ ia_item = self.ia_session.get_item(item.archiveorg_item_name)
+ if not ia_item.exists:
+ return None
+ item_files = ia_item.get_files(on_the_fly=False)
+ assert item.manifest
+ for wanted in item.manifest:
+ found = False
+ for existing in item_files:
+ if existing.name == wanted.path:
+ if (
+ (
+ (existing.sha1 and existing.sha1 == wanted.sha1)
+ or (existing.md5 and existing.md5 == wanted.md5)
+ )
+ and existing.name == wanted.path
+ and existing.size == wanted.size
+ ):
+ found = True
+ wanted.status = "exists"
+ break
+ else:
+ wanted.status = "mismatch-existing"
+ break
+ if not found:
+ print(
+ f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}",
+ file=sys.stderr,
+ )
+ return None
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status="success-existing",
+ manifest=item.manifest,
+ )
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ """
+ May require extra context to pass along to archive.org item creation.
+ """
+ existing = self.check_existing(item)
+ if existing:
+ return existing
+
+ if item.platform_name == "archiveorg":
+ raise PlatformScopeError("shouldn't download archive.org into itself")
+
+ local_dir = self.working_dir + item.archiveorg_item_name
+ assert local_dir.startswith("/")
+ assert local_dir.count("/") > 2
+ try:
+ os.mkdir(local_dir)
+ except FileExistsError:
+ pass
+
+ # 1. download all files locally
+ assert item.manifest
+ for m in item.manifest:
+ if m.path != sanitize_fs_path(m.path):
+ m.status = "unsafe-path"
+ continue
+
+ local_path = local_dir + "/" + m.path
+ assert m.platform_url
+
+ if not os.path.exists(os.path.dirname(local_path)):
+ os.mkdir(os.path.dirname(local_path))
+ if os.path.exists(local_path):
+ m.status = "exists-local"
+ else:
+ print(f" downloading {m.path}", file=sys.stderr)
+ # create any sub-directories for this path, if necessary
+ if not os.path.exists(os.path.dirname(local_path)):
+ os.mkdir(os.path.dirname(local_path))
+ try:
+ with self.http_session.get(
+ m.platform_url,
+ stream=True,
+ allow_redirects=True,
+ timeout=2 * 60 * 60,
+ ) as r:
+ r.raise_for_status()
+ with open(local_path + ".partial", "wb") as f:
+ for chunk in r.iter_content(chunk_size=256 * 1024):
+ f.write(chunk)
+ os.rename(local_path + ".partial", local_path)
+ m.status = "downloaded-local"
+ except requests.exceptions.RequestException:
+ m.status = "error-platform-download"
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status="error-platform-download",
+ )
+
+ print(f" verifying {m.path}", file=sys.stderr)
+ file_meta = gen_file_metadata_path(local_path, allow_empty=True)
+ if file_meta["size_bytes"] != m.size:
+ print(f" expected: {m.size} found: {file_meta['size_bytes']}", file=sys.stderr)
+ m.status = "mismatch-size"
+ continue
+
+ if m.sha1:
+ if file_meta["sha1hex"] != m.sha1:
+ m.status = "mismatch-sha1"
+ continue
+ else:
+ m.sha1 = file_meta["sha1hex"]
+
+ if m.sha256:
+ if file_meta["sha256hex"] != m.sha256:
+ m.status = "mismatch-sha256"
+ continue
+ else:
+ m.sha256 = file_meta["sha256hex"]
+
+ if m.md5:
+ if file_meta["md5hex"] != m.md5:
+ m.status = "mismatch-md5"
+ continue
+ else:
+ m.md5 = file_meta["md5hex"]
+
+ if m.mimetype:
+ # 'magic' isn't good at parsing more detailed text file formats like text/csv
+ if (
+ file_meta["mimetype"] != m.mimetype
+ and file_meta["mimetype"] != "text/plain"
+ ):
+ # these 'tab-separated-values' from dataverse are just noise, don't log them
+ if m.mimetype != "text/tab-separated-values":
+ print(
+ f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}",
+ file=sys.stderr,
+ )
+ m.mimetype = file_meta["mimetype"]
+ else:
+ m.mimetype = file_meta["mimetype"]
+ m.status = "verified-local"
+
+ # if verification failed for any individual files, bail out
+ for m in item.manifest:
+ if m.status != "verified-local":
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status=m.status,
+ )
+
+ # 2. upload all files, with metadata
+ assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"]
+ item_files = {}
+ for m in item.manifest:
+ local_path = local_dir + "/" + m.path
+ if m.path == "name":
+ raise NotImplementedError(
+ "fileset file path is 'name', which is a reserved keyword"
+ )
+ item_files[m.path] = local_path
+ if len(item_files) != len(item.manifest):
+ raise NotImplementedError("file/manifest length mismatch: duplicated file paths?")
+
+ print(
+ f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...",
+ file=sys.stderr,
+ )
+ try:
+ internetarchive.upload(
+ item.archiveorg_item_name,
+ files=item_files,
+ metadata=item.archiveorg_item_meta,
+ checksum=True,
+ queue_derive=False,
+ verify=True,
+ )
+ except requests.exceptions.RequestException:
+ return ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ manifest=item.manifest,
+ status="error-archiveorg-upload",
+ )
+
+ for m in item.manifest:
+ m.status = "success"
+
+ # 3. delete local directory
+ if not self.skip_cleanup_local_files:
+ shutil.rmtree(local_dir)
+
+ result = ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status=self.success_status,
+ manifest=item.manifest,
+ )
+
+ return result
+
+
+class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
+ """
+ ArchiveorgFilesetStrategy currently works fine with individual files. Just
+ need to over-ride the ingest_strategy name.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.ArchiveorgFile
+ self.success_status = "success-file"
+
+
+class WebFilesetStrategy(FilesetIngestStrategy):
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.ingest_strategy = IngestStrategy.WebFileset
+ self.wayback_client = WaybackClient()
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.spn_client = SavePageNowClient(
+ spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+ )
+ self.max_spn_manifest = 20
+
+ def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
+ """
+ For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt
+
+ TODO:
+ - full fetch_resource() method which can do SPN requests
+ """
+
+ assert item.manifest
+ file_file_meta = None
+ file_resource = None
+ for m in item.manifest:
+ fetch_url = m.platform_url
+ if not fetch_url:
+ raise NotImplementedError(
+ "require 'platform_url' for each file when doing Web fetching"
+ )
+
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
+
+ if self.try_spn2 and (
+ resource is None or (resource and resource.status == "no-capture")
+ ):
+ if len(item.manifest) > self.max_spn_manifest:
+ m.status = "too-much-spn"
+ continue
+ via = "spn2"
+ resource = self.spn_client.crawl_resource(
+ fetch_url, self.wayback_client, force_simple_get=True
+ )
+
+ print(
+ "[FETCH {:>6}] {} {}".format(
+ via,
+ (resource and resource.status),
+ (resource and resource.terminal_url) or fetch_url,
+ ),
+ file=sys.stderr,
+ )
+
+ m.terminal_url = resource.terminal_url
+ m.terminal_dt = resource.terminal_dt
+ m.status = resource.status
+ if self.ingest_strategy == "web-file":
+ file_resource = resource
+
+ if resource.status != "success":
+ continue
+ else:
+ assert resource.terminal_status_code == 200
+
+ if not resource.body:
+ m.status = "empty-blob"
+ continue
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, _html_resource = fix_transfer_encoding(file_meta, resource)
+ except Exception:
+ m.status = "transfer-encoding-error"
+ continue
+
+ if self.ingest_strategy == "web-file":
+ file_file_meta = file_meta
+
+ if (
+ file_meta["size_bytes"] != m.size
+ or (m.md5 and m.md5 != file_meta["md5hex"])
+ or (m.sha1 and m.sha1 != file_meta["sha1hex"])
+ ):
+ m.status = "mismatch"
+ continue
+
+ m.md5 = m.md5 or file_meta["md5hex"]
+ m.sha1 = m.sha1 or file_meta["sha1hex"]
+ m.sha256 = m.sha256 or file_meta["sha256hex"]
+ m.mimetype = m.mimetype or file_meta["mimetype"]
+
+ overall_status = self.success_status
+ for m in item.manifest:
+ if m.status != "success":
+ overall_status = m.status or "not-processed"
+ break
+ if not item.manifest:
+ overall_status = "empty-manifest"
+
+ result = ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status=overall_status,
+ manifest=item.manifest,
+ )
+ if self.ingest_strategy == "web-file":
+ result.file_file_meta = file_file_meta
+ result.file_resource = file_resource
+ return result
+
+
+class WebFileStrategy(WebFilesetStrategy):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.ingest_strategy = IngestStrategy.WebFile
+ self.success_status = "success-file"
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
new file mode 100644
index 0000000..3398833
--- /dev/null
+++ b/python/sandcrawler/fileset_types.py
@@ -0,0 +1,74 @@
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel
+
+
+class IngestStrategy(str, Enum):
+ WebFile = "web-file"
+ WebFileset = "web-fileset"
+ WebFilesetBundled = "web-fileset-bundled"
+ ArchiveorgFile = "archiveorg-file"
+ ArchiveorgFileset = "archiveorg-fileset"
+ ArchiveorgFilesetBundled = "archiveorg-fileset-bundled"
+
+
+class FilesetManifestFile(BaseModel):
+ path: str
+ size: Optional[int]
+ md5: Optional[str]
+ sha1: Optional[str]
+ sha256: Optional[str]
+ mimetype: Optional[str]
+ extra: Optional[Dict[str, Any]]
+
+ status: Optional[str]
+ platform_url: Optional[str]
+ terminal_url: Optional[str]
+ terminal_dt: Optional[str]
+
+
+class FilesetPlatformItem(BaseModel):
+ platform_name: str
+ platform_status: str
+ platform_domain: Optional[str]
+ platform_id: Optional[str]
+ manifest: Optional[List[FilesetManifestFile]]
+
+ archiveorg_item_name: Optional[str]
+ archiveorg_item_meta: Optional[dict]
+ web_base_url: Optional[str]
+ web_bundle_url: Optional[str]
+
+
+class ArchiveStrategyResult(BaseModel):
+ ingest_strategy: str
+ status: str
+ manifest: List[FilesetManifestFile]
+ file_file_meta: Optional[Dict[str, Any]]
+ file_resource: Optional[Any]
+ bundle_file_meta: Optional[Dict[str, Any]]
+ bundle_resource: Optional[Any]
+ bundle_archiveorg_path: Optional[str]
+
+
+class PlatformScopeError(Exception):
+ """
+ For incidents where the platform helper discovers that the fileset/dataset is
+ out-of-scope after it has already started processing it.
+
+ For example, attempting to ingest:
+
+ - a 'latest version' record, when the platform has version-specific records
+ - a single file within a dataset for a platform which has file-level identifiers
+ """
+
+ pass
+
+
+class PlatformRestrictedError(Exception):
+ """
+ When datasets are not publicly available on a platform (yet)
+ """
+
+ pass
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index f157241..aa2c112 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,116 +1,376 @@
+import html
+import sys
+import time
+import xml.etree.ElementTree
+from typing import Any, Dict, List, Optional
import requests
-from collections import Counter
+from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml
-from .workers import SandcrawlerWorker
-from .misc import gen_file_metadata
-from .ia import WaybackClient, WaybackError
+from .ia import WaybackClient
+from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
-class GrobidClient(object):
+MAX_GROBID_BLOB_SIZE: int = 256 * 1024 * 1024 # ~256 MByte
+
+
+def clean_crossref_unstructured(raw: str) -> str:
+ """
+ Applies Crossref-specific cleanups to an 'unstructured' citation string.
+ """
+
+ # detect repeated strings with double space separating them
+ subs = raw.split(" ")
+ if len(subs) == 2 and subs[0] == subs[1]:
+ raw = subs[0]
+ else:
+ raw = " ".join(subs)
+
+ # remove HTML/XML numeric characters
+ if "&#" in raw or "&amp;" in raw or "&gt;" in raw or "&lt;" in raw:
+ raw = html.unescape(raw)
+
+ raw = raw.replace("  ", " ")
+ raw = raw.strip()
+ return raw
+
+
+def test_clean_ref_str() -> None:
+ # NOTE: this as emdash, non-breaking string characters in it
+ raw_with_nbsp = """Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394. Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394."""
+ cleaned = """Qingyao Ai Keping Bi Cheng Luo Jiafeng Guo and W.\u00a0Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. (2018) 385\u2013394."""
+ assert clean_crossref_unstructured(raw_with_nbsp) == cleaned
+
+ # HTML escape characters
+ assert (
+ clean_crossref_unstructured(
+ "J-B Champion, C.Collin, INSEE Premi&#232;re N&#176;1710 september 2018 - National Institute of Statistics and Economic Studies"
+ )
+ == "J-B Champion, C.Collin, INSEE Première N°1710 september 2018 - National Institute of Statistics and Economic Studies"
+ )
+
+ # simple doubling
+ assert (
+ clean_crossref_unstructured("https://graph500.org/. https://graph500.org/.")
+ == "https://graph500.org/."
+ )
+ assert (
+ clean_crossref_unstructured(
+ """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg. Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
+ )
+ == """Ronald L. Rivest and Butler W. Lampson. 1996. SDSI: A Simple Distributed Security Infrastructure. In Advances in Cryptology — CRYPTO ’96. Springer Berlin Heidelberg."""
+ )
+
+ # all non-breaking whitespace
+ assert (
+ clean_crossref_unstructured(
+ "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
+ )
+ == ""
+ )
- def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
+
+class GrobidClient(object):
+ def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
- self.consolidate_mode = int(kwargs.get('consolidate_mode', 2))
+ self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))
+ self.session = requests_retry_session()
- def process_fulltext(self, blob, consolidate_mode=None):
+ def process_fulltext(
+ self, blob: bytes, consolidate_mode: Optional[int] = None
+ ) -> Dict[str, Any]:
"""
Returns dict with keys:
- status_code
- status (slug)
- error_msg (if status == 'error')
- tei_xml (if status is 200)
-
- TODO: persist connection for performance?
"""
assert blob
- if consolidate_mode == None:
+ if len(blob) > MAX_GROBID_BLOB_SIZE:
+ return {
+ "status": "blob-too-large",
+ "error_msg": f"Not going to process very large file ({len(blob)} bytes)",
+ }
+
+ if consolidate_mode is None:
consolidate_mode = self.consolidate_mode
+ assert consolidate_mode is not None
- grobid_response = requests.post(
- self.host_url + "/api/processFulltextDocument",
- files={
- 'input': blob,
- 'consolidateHeader': self.consolidate_mode,
- 'consolidateCitations': 0, # too expensive for now
- 'includeRawCitations': 1,
+ try:
+ grobid_response = self.session.post(
+ self.host_url + "/api/processFulltextDocument",
+ files={
+ "input": blob,
+ },
+ data={
+ "consolidateHeader": consolidate_mode,
+ "consolidateCitations": 0, # too expensive for now
+ "includeRawCitations": 1,
+ "includeRawAffiliations": 1,
+ "teiCoordinates": ["ref", "figure", "persName", "formula", "biblStruct"],
+ "segmentSentences": 1,
+ },
+ timeout=180.0,
+ )
+ except requests.Timeout:
+ return {
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "GROBID request (HTTP POST) timeout",
}
- )
+ except requests.exceptions.ConnectionError as ce:
+ # intentionally raising this, so workers crash when GROBID
+ # unavailable. but do add a sleep to slow things down.
+ print(
+ "GROBID ConnectionError. sleeping as a slow-down before crashing",
+ file=sys.stderr,
+ )
+ time.sleep(5.0)
+ raise ce
- info = dict(
- status_code=grobid_response.status_code,
- )
+ info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
if grobid_response.status_code == 200:
- info['status'] = 'success'
- info['tei_xml'] = grobid_response.text
+ info["status"] = "success"
+ info["tei_xml"] = grobid_response.text
+ if len(info["tei_xml"]) > 12000000:
+ # XML is larger than Kafka message size, and much larger than
+ # an article in general; bail out
+ info["status"] = "error"
+ info["error_msg"] = "response XML too large: {} bytes".format(
+ len(info["tei_xml"])
+ )
+ info.pop("tei_xml")
else:
# response.text is .content decoded as utf-8
- info['status'] = 'error'
- info['error_msg'] = grobid_response.text[:10000]
+ info["status"] = "error"
+ info["error_msg"] = grobid_response.text[:10000]
return info
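+ # rough usage sketch (the host URL and file path here are hypothetical):
+ #
+ # client = GrobidClient(host_url="http://localhost:8070")
+ # with open("paper.pdf", "rb") as f:
+ #     result = client.process_fulltext(f.read())
+ # if result["status"] == "success":
+ #     tei_xml = result["tei_xml"]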
-class GrobidWorker(SandcrawlerWorker):
+ def process_citation_list(self, unstructured_list: List[str]) -> List[GrobidBiblio]:
+ if not unstructured_list:
+ return []
+ if len(unstructured_list) > 5000:
+ raise ValueError("more than 5,000 references in a batch is just too much")
- def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):
- super().__init__()
+ try:
+ grobid_response = self.session.post(
+ self.host_url + "/api/processCitationList",
+ data={
+ "citations": unstructured_list,
+ "consolidateCitations": 0,
+ "includeRawCitations": 1,
+ },
+ timeout=30.0,
+ )
+ except requests.Timeout as te:
+ # TODO: handle somehow?
+ raise te
+
+ grobid_response.raise_for_status()
+ return parse_citation_list_xml(grobid_response.text)
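+ # sketch: passing a short list of raw citation strings (this example string
+ # is made up) returns parsed GrobidBiblio objects:
+ # refs = client.process_citation_list(["Doe, J. (2001). Some Title. Some Journal 1(2)."])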
+
+ def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+ if result["status"] != "success":
+ return None
+ try:
+ tei_doc = parse_document_xml(result["tei_xml"])
+ except xml.etree.ElementTree.ParseError as pe:
+ result["status"] = "bad-grobid-xml"
+ return dict(error_msg=str(pe)[:1000])
+ tei_doc.remove_encumbered()
+ tei_json = tei_doc.to_legacy_dict()
+ meta = dict()
+ biblio = dict()
+ for k in (
+ "title",
+ "authors",
+ "journal",
+ "date",
+ "doi",
+ ):
+ if tei_json.get(k):
+ biblio[k] = tei_json[k]
+ meta["biblio"] = biblio
+ for k in ("grobid_version", "grobid_timestamp", "fatcat_release", "language_code"):
+ if tei_json.get(k):
+ meta[k] = tei_json[k]
+ return meta
+
+ def should_parse_crossref_ref(self, ref: Dict[str, Any]) -> bool:
+ """
+ Helper function to decide whether to run GROBID parsing on a Crossref
+ reference.
+
+ For example, if there is already a DOI in the ref metadata, parsing can be
+ skipped; likewise if there is already sufficient structured metadata.
+ Skipping could also be made to depend on the source of the DOI linkage.
+ """
+ if ref.get("DOI"):
+ return False
+ if len(ref.get("unstructured", "").strip()) <= 6:
+ return False
+
+ if (
+ ref.get("year")
+ and ref.get("author")
+ and (ref.get("article-title") or ref.get("series-title") or ref.get("volume-title"))
+ ):
+ return False
+ elif ref.get("year") and ref.get("author") and ref.get("journal-title"):
+ return False
+ elif ref.get("journal-title") and ref.get("volume") and ref.get("first-page"):
+ return False
+
+ return True
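+ # e.g. a ref that is only {"unstructured": "Doe, J. (2001). Some Title..."}
+ # is worth sending to GROBID (returns True), while a ref that already carries
+ # a DOI or enough structured fields is skipped (returns False)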
+
+ def crossref_refs(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Given a complete Crossref metadata record, runs any 'unstructured'
+ references through GROBID citation parsing.
+
+ The returned dict is in the schema of the `grobid_refs` database table,
+ in dict form:
+
+ source: 'crossref'
+ source_id: doi, as lower-case string
+ source_ts: Crossref indexed timestamp, if available
+ ('updated' is not set)
+ refs_json: list of dicts
+ """
+
+ # remove API wrapper around record, if necessary
+ if "message" in record and "DOI" not in record:
+ record = record["message"]
+
+ ret = dict(
+ source="crossref",
+ source_id=record["DOI"].lower(),
+ source_ts=record["indexed"]["date-time"],
+ refs_json=[],
+ )
+ all_refs = record.get("reference", [])
+ unstructured_refs = []
+ for r in all_refs:
+ if not r.get("unstructured"):
+ continue
+ if not self.should_parse_crossref_ref(r):
+ continue
+ unstructured_refs.append(r)
+ if not unstructured_refs:
+ return ret
+
+ # some reasonable cap on length of refs per work
+ if len(unstructured_refs) > 2000:
+ print(
+ f"truncating very large reference list for doi:{record['DOI']} len:{len(unstructured_refs)}",
+ file=sys.stderr,
+ )
+ unstructured_refs = unstructured_refs[:2000]
+
+ clean_refs = [clean_crossref_unstructured(r["unstructured"]) for r in unstructured_refs]
+ refs = self.process_citation_list(clean_refs)
+
+ assert len(refs) == len(unstructured_refs)
+ refs_json = []
+ for i in range(len(refs)):
+ refs[i].id = unstructured_refs[i].get("key")
+ refs[i].index = None
+ refs_json.append(refs[i].to_dict())
+ ret["refs_json"] = refs_json
+ return ret
+
+
+class GrobidWorker(SandcrawlerFetchWorker):
+ def __init__(
+ self,
+ grobid_client: GrobidClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs,
+ ):
+ super().__init__(wayback_client=wayback_client)
self.grobid_client = grobid_client
- self.wayback_client = wayback_client
self.sink = sink
- self.consolidate_mode = 2
-
- def process(self, record):
- if record.get('warc_path') and record.get('warc_offset'):
- # it's a full CDX dict. fetch using WaybackClient
- if not self.wayback_client:
- raise Exception("wayback client not configured for this GrobidWorker")
- try:
- blob = self.wayback_client.fetch_warc_content(record['warc_path'],
- record['warc_offset'], record['warc_csize'])
- except WaybackError as we:
- return dict(status="error-wayback", error_msg=str(we), source=record)
- elif record.get('url') and record.get('datetime'):
- # it's a partial CDX dict or something? fetch using WaybackClient
- if not self.wayback_client:
- raise Exception("wayback client not configured for this GrobidWorker")
- try:
- blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime'])
- except WaybackError as we:
- return dict(status="error-wayback", error_msg=str(we), source=record)
- elif record.get('item') and record.get('path'):
- # it's petabox link; fetch via HTTP
- resp = requests.get("https://archive.org/serve/{}/{}".format(
- record['item'], record['path']))
- try:
- resp.raise_for_status()
- except Exception as e:
- return dict(status="error-petabox", error_msg=str(e), source=record)
- blob = resp.body
- else:
- raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
- if not blob:
- return dict(status="error", error_msg="empty blob", source=record)
- result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['source'] = record
- result['key'] = result['file_meta']['sha1hex']
+ self.consolidate_mode = 0
+
+ def timeout_response(self, task: Any) -> Any:
+ default_key = task["sha1hex"]
+ return dict(
+ status="error-timeout",
+ error_msg="internal GROBID worker timeout",
+ source=task,
+ key=default_key,
+ )
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ fetch_result = self.fetch_blob(record)
+ if fetch_result["status"] != "success":
+ return fetch_result
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
+
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["source"] = record
+ result["key"] = result["file_meta"]["sha1hex"]
return result
+
+class CrossrefRefsWorker(SandcrawlerWorker):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.grobid_client = grobid_client
+ self.sink = sink
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ # handle the rare case of bad TEI-XML response
+ # eg: https://github.com/kermitt2/grobid/issues/848
+ try:
+ return self.grobid_client.crossref_refs(record)
+ except xml.etree.ElementTree.ParseError:
+ print(
+ f"GROBID returned bad XML for Crossref DOI: {record.get('DOI')}",
+ file=sys.stderr,
+ )
+ # but add a small slow-down so we don't churn through these if
+ # GROBID is just misconfigured or something
+ time.sleep(3)
+ return None
+ except requests.exceptions.HTTPError:
+ print(f"GROBID HTTP error for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ time.sleep(3)
+ return None
+ except requests.exceptions.ReadTimeout:
+ print(f"GROBID HTTP timeout for Crossref DOI: {record.get('DOI')}", file=sys.stderr)
+ time.sleep(3)
+ return None
+
+
class GrobidBlobWorker(SandcrawlerWorker):
"""
This is sort of like GrobidWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, grobid_client, sink=None, **kwargs):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
super().__init__()
self.grobid_client = grobid_client
self.sink = sink
- self.consolidate_mode = 2
+ self.consolidate_mode = 0
- def process(self, blob):
- assert blob
- result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
+ if not blob:
+ return None
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
return result
-
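For context, a minimal usage sketch of the blob-based worker above. The import path and the GrobidClient constructor arguments are assumptions (that class is defined elsewhere in the repo, not in this hunk):

from sandcrawler.grobid import GrobidBlobWorker, GrobidClient  # assumed module path

# assumed constructor: GrobidClient pointed at a running GROBID service
grobid_client = GrobidClient(host_url="http://localhost:8070")
worker = GrobidBlobWorker(grobid_client)

with open("paper.pdf", "rb") as f:
    blob = f.read()

result = worker.process(blob)
if result:
    # 'key' is the blob's SHA-1 hex, copied from gen_file_metadata()
    print(result["key"])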
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
new file mode 100644
index 0000000..207f067
--- /dev/null
+++ b/python/sandcrawler/html.py
@@ -0,0 +1,365 @@
+import json
+import re
+import sys
+import urllib.parse
+from typing import Any, Dict
+
+from bs4 import BeautifulSoup
+
+RESEARCHSQUARE_REGEX = re.compile(
+ r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"'
+)
+IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"')
+OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')
+SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';")
+
+
+def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
+ """
+ Takes an HTML document (and URL), assumed to be a landing page, and tries
+ to find a fulltext PDF url.
+
+    On error, or if it fails to extract a URL, returns an empty dict.
+ """
+
+ host_prefix = "/".join(html_url.split("/")[:3])
+ try:
+ soup = BeautifulSoup(html_body, "html.parser")
+ except TypeError as te:
+ print(f"{te} (url={html_url})", file=sys.stderr)
+ return dict()
+ except UnboundLocalError as ule:
+ print(f"{ule} (url={html_url})", file=sys.stderr)
+ return dict()
+
+ # ignoring most type checks on bs4 output in this function (which is partially deprecated)
+ meta: Any
+ url: Any
+ redirect: Any
+
+ ### General Tricks ###
+ # note: most of these have migrated to the html_biblio code path
+
+ meta = soup.find("meta", attrs={"name": "generator"})
+ meta_generator = None
+ if meta and meta.get("content"):
+ meta_generator = meta["content"].strip()
+
+ ### Publisher/Platform Specific ###
+
+ # research square (researchsquare.com)
+ if "researchsquare.com/article/" in html_url:
+ # JSON in body with a field like:
+ # "url":"https://assets.researchsquare.com/files/4a57970e-b002-4608-b507-b95967649483/v2/Manuscript.pdf"
+ m = RESEARCHSQUARE_REGEX.search(html_body.decode("utf-8"))
+ if m:
+ url = m.group(1)
+ assert len(url) < 4096
+ return dict(release_stage="manuscript", pdf_url=url, technique="publisher")
+
+    # elsevier linking hub
+ # https://linkinghub.elsevier.com/retrieve/pii/S1569199319308975
+ if "://linkinghub.elsevier.com/retrieve/pii/" in html_url:
+ # <input type="hidden" name="redirectURL" value="http%3A%2F%2Fcysticfibrosisjournal.com%2Fretrieve%2Fpii%2FS1569199319308975" id="redirectURL"/>
+ redirect = soup.find("input", attrs={"name": "redirectURL"})
+ if redirect:
+ url = redirect["value"].strip()
+ if "http" in url:
+ url = urllib.parse.unquote(url)
+                # drop the '?via' tracking query parameter
+ url = url.split("?via")[0]
+ return dict(next_url=url, technique="elsevier-linkinghub")
+
+ # sciencedirect PDF URL extract
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670
+ if "sciencedirect.com/science/article/pii/" in html_url and not html_url.endswith(".pdf"):
+ json_tag: Any = soup.find(
+ "script", attrs={"type": "application/json", "data-iso-key": "_0"}
+ )
+ url = None
+ if json_tag:
+ try:
+ json_text = json_tag.string
+ json_meta = json.loads(json_text)
+ pdf_meta = json_meta["article"]["pdfDownload"]["urlMetadata"]
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
+ url = (
+ html_url
+ + pdf_meta["pdfExtension"]
+ + "?md5="
+ + pdf_meta["queryParams"]["md5"]
+ + "&pid="
+ + pdf_meta["queryParams"]["pid"]
+ )
+ except (KeyError, TypeError, json.JSONDecodeError):
+ pass
+ if url:
+ return dict(pdf_url=url, technique="sciencedirect-munge-json")
+
+ # sciencedirect PDF bounce page
+ # https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf
+ if "://www.sciencedirect.com/" in html_url and html_url.endswith(".pdf"):
+ # window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=[...]&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=[...]&hash=[...]&host=[...]&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=[...]&type=client';
+ m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(html_body.decode("utf-8"))
+ if m:
+ url = m.group(1)
+ assert len(url) < 4000
+ return dict(pdf_url=url, technique="sciencedirect-bounce")
+
+ # ieeexplore.ieee.org
+ # https://ieeexplore.ieee.org/document/8730316
+ if "://ieeexplore.ieee.org/document/" in html_url:
+ # JSON in body with a field like:
+ # "pdfPath":"/iel7/6287639/8600701/08730316.pdf",
+ m = IEEEXPLORE_REGEX.search(html_body.decode("utf-8"))
+ if m:
+ url = m.group(1)
+ assert len(url) < 4096
+ return dict(
+ release_stage="published", pdf_url=host_prefix + url, technique="ieeexplore"
+ )
+ # https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313
+ if "://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber" in html_url:
+ # HTML iframe like:
+ # <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&amp;arnumber=8730313&amp;isnumber=8600701&amp;ref=" frameborder="0"></iframe>
+ iframe: Any = soup.find("iframe")
+ if iframe and ".pdf" in iframe["src"]:
+ return dict(pdf_url=iframe["src"], technique="iframe")
+
+ # https://insights.ovid.com/crossref?an=00042307-202001000-00013
+ # Ovid is some kind of landing page bounce portal tracking run-around.
+ # Can extract actual journal URL from javascript blob in the HTML
+ if "://insights.ovid.com/crossref" in html_url:
+ # var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
+ m = OVID_JOURNAL_URL_REGEX.search(html_body.decode("utf-8"))
+ if m:
+ url = m.group(1)
+ assert len(url) < 4096
+ return dict(next_url=url, technique="ovid")
+
+ # osf.io
+ # https://osf.io/8phvx/
+ # https://osf.io/preprints/socarxiv/8phvx/
+ # wow, they ship total javascript crud! going to just guess download URL
+ # based on URL for now. Maybe content type header would help?
+ OSF_DOMAINS = [
+ "://osf.io/",
+ "://biohackrxiv.org/",
+ "://psyarxiv.com/",
+ "://arabixiv.org/",
+ "://engrxiv.org/",
+        "://edarxiv.org/",
+ "://ecsarxiv.org/",
+ "://ecoevorxiv.org/",
+ "://frenxiv.org/",
+ "://indiarxiv.org/",
+ "://mindrxiv.org/",
+ "://mediarxiv.org/",
+ "://paleorxiv.org/",
+ "://thesiscommons.org/",
+ ]
+ for domain in OSF_DOMAINS:
+ if (
+ domain in html_url
+ and (len(html_url.split("/")) in [4, 5] or "/preprints/" in html_url)
+ and "/download" not in html_url
+ ):
+ if not html_url.endswith("/"):
+ next_url = html_url + "/download"
+ else:
+ next_url = html_url + "download"
+ return dict(next_url=next_url, technique="osf-by-url")
+
+ # wiley
+ # https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787
+ if "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
+ if b"/doi/pdfdirect/" in html_body:
+ next_url = html_url.replace("/doi/pdf/", "/doi/pdfdirect/")
+ return dict(next_url=next_url, technique="wiley-pdfdirect")
+
+ # arxiv abstract pages
+ if "://arxiv.org/abs/" in html_url:
+ url = html_url.replace("/abs/", "/pdf/")
+ return dict(pdf_url=url, technique="arxiv-url")
+
+ # american archivist (OA)
+ # https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
+ if "://americanarchivist.org/doi/" in html_url and "/doi/pdf" not in html_url:
+ # use a more aggressive direct guess to avoid rate-limiting...
+ if "/doi/10." in html_url:
+ url = html_url.replace("/doi/10.", "/doi/pdf/10.")
+ return dict(pdf_url=url, technique="archivist-url")
+ # <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
+ hrefs = soup.find_all("a", attrs={"target": "_blank"})
+ for href in hrefs:
+ url = href["href"].strip()
+ if "/doi/pdf/" in url:
+ if url.startswith("http"):
+ return dict(pdf_url=url, technique="publisher-href")
+ elif url.startswith("/"):
+ return dict(pdf_url=host_prefix + url, technique="publisher-href")
+
+ # protocols.io
+ # https://www.protocols.io/view/flow-cytometry-protocol-mgdc3s6
+ if "://www.protocols.io/view/" in html_url and not html_url.endswith(".pdf"):
+ url = html_url + ".pdf"
+ return dict(pdf_url=url, technique="protocolsio-url")
+
+ # degruyter.com
+ # https://www.degruyter.com/view/books/9783486594621/9783486594621-009/9783486594621-009.xml
+ if "://www.degruyter.com/view/" in html_url and html_url.endswith(".xml"):
+ url = html_url.replace("/view/", "/downloadpdf/").replace(".xml", ".pdf")
+ return dict(pdf_url=url, technique="degruyter-url")
+
+ # journals.lww.com (Wolters Kluwer)
+ # https://journals.lww.com/spinejournal/Abstract/publishahead/Making_the_Most_of_Systematic_Reviews_and.94318.aspx
+ # DISABLED: they seem to redirect our crawler back to a "Fulltext" page and
+ # we never get the content.
+ if "://journals.lww.com/" in html_url and False:
+ # data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw=="
+ for line in html_body.split(b"\n"):
+ if b"data-pdf-url=" in line:
+ line = line.decode("utf-8")
+ url = line.strip().replace("data-pdf-url=", "").replace('"', "")
+ if url.startswith("http") and "pdfs.journals.lww.com" in url:
+ return dict(pdf_url=url, technique="journals.lww.com-jsvar")
+
+ # www.ahajournals.org
+ # https://www.ahajournals.org/doi/10.1161/circ.110.19.2977
+ if "://www.ahajournals.org/doi/" in html_url and "/doi/pdf/" not in html_url:
+ # <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a>
+ if b"/doi/pdf/10." in html_body:
+ url = html_url.replace("/doi/10.", "/doi/pdf/10.")
+ url = url + "?download=true"
+ return dict(pdf_url=url, technique="ahajournals-url")
+
+ # ehp.niehs.nih.gov
+ # https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709
+ # https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51
+ if "://ehp.niehs.nih.gov/doi/" in html_url:
+ # <a href="/doi/pdf/10.1289/EHP4709" target="_blank">
+ if b"/doi/pdf/10." in html_body:
+ url = html_url.replace("/doi/full/10.", "/doi/pdf/10.").replace(
+ "/doi/10.", "/doi/pdf/10."
+ )
+            return dict(pdf_url=url, technique="ehp.niehs.nih.gov-url")
+
+ # cogentoa.com
+ # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
+ if "://www.cogentoa.com/article/" in html_url and ".pdf" not in html_url:
+ # blech, it's a SPA! All JS
+ # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf
+ url = html_url + ".pdf"
+ return dict(pdf_url=url, technique="cogentoa-url")
+
+ # chemrxiv.org (likely to be other figshare domains also)
+ # https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419
+ if "://chemrxiv.org/articles/" in html_url or ".figshare.org/articles/" in html_url:
+ # <script id="app-data" type="text/json"> [...] </script>
+ json_tag = soup.find("script", id="app-data", attrs={"type": "text/json"})
+ if json_tag and json_tag.string:
+ app_data = json.loads(json_tag.string)
+ # "exportPdfDownloadUrl": "https://s3-eu-west-1.amazonaws.com/itempdf74155353254prod/10101419/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives__The_Hidden_Nature_of_Dasatinib_v1.pdf"
+ url = app_data.get("article", {}).get("exportPdfDownloadUrl")
+ if url and url.startswith("http"):
+ return dict(pdf_url=url, technique="figshare-json")
+
+ # CNKI COVID-19 landing pages
+ # http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ
+ if "://en.gzbd.cnki.net/KCMS/detail/detail.aspx" in html_url:
+ # <a onclick="WriteKrsDownLog()" target="_blank" id="pdfDown" name="pdfDown" href="/gzbt/download.aspx?filename=4Q1ZYpFdKFUZ6FDR1QkRrolayRXV2ZzattyQ3QFa2JXTyZXUSV3QRFkbndzaGV2KyJXWZVEbFdVYnZndD9EOxg1Tj5Eeys2SMFzLZ5kcuFkM3dEbsR2ZjxEaShVdJhFdp90KhlVVzcjVVlXUVNHWBtWS5Rlb5cnc&amp;tablename=GZBJLAST2020&amp;dflag=pdfdown&#xA; "><i></i>PDF Download</a>
+ href = soup.find("a", attrs={"id": "pdfDown"})
+ if href:
+ url = href["href"].strip().replace("&#xA;", "")
+ if not url.startswith("http"):
+ url = host_prefix + url
+ return dict(pdf_url=url, technique="cnki-href")
+
+ # RWTH AACHEN repository
+ if "://publications.rwth-aachen.de/record/" in html_url:
+ record_id = html_url.split("/")[-1]
+ url = f"{html_url}/files/{record_id}.pdf"
+ if record_id.isdigit() and url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="rwth-aachen-url")
+
+ # physchemaspects.ru
+ if "://physchemaspects.ru/" in html_url and soup:
+ for href in soup.find_all("a"):
+ if href.text == "download PDF file":
+ url = href["href"]
+ if url.startswith("/"):
+ url = host_prefix + url
+ return dict(pdf_url=url, technique="physchemaspects-href")
+
+ # OJS 3 (some)
+ if meta_generator and meta_generator.startswith("Open Journal Systems"):
+ href = soup.find("a", attrs={"class": "obj_galley_link file"})
+ if href and href.text and "pdf" in href.text.lower():
+ url = href["href"].strip()
+ if url.startswith("/"):
+ url = host_prefix + url
+ return dict(pdf_url=url, technique="ojs-galley-href")
+
+ # ETH zurich e-periodica
+ if "://www.e-periodica.ch/digbib/view" in html_url:
+ url = html_url.replace("digbib/view", "cntmng").split("#")[0]
+ if url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="href-eperiodica")
+
+ # JMIR
+ # https://mhealth.jmir.org/2020/7/e17891/
+ if ".jmir.org/" in html_url and "/pdf" not in html_url and html_url.endswith("/"):
+ url = html_url + "pdf"
+ return dict(pdf_url=url, technique="jmir-url")
+
+ # Google Drive
+ # this is assuming it is a PDF
+ if "drive.google.com/file/d/" in html_url and "/view" in html_url:
+ gdrive_id = html_url.split("/")[5]
+ if len(gdrive_id) > 10:
+ # https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24
+ return dict(
+ pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
+ technique="google-drive",
+ )
+
+ # https://doi.org/10.24850/j-tyca-14-4-7
+ # https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150
+ if "docs.google.com/viewer?url=" in html_url:
+ original_url = html_url.split("?url=")[1]
+ if original_url:
+ return dict(pdf_url=original_url, technique="docs.google.com viewer")
+
+ ### below here we are doing guesses
+
+ # generic guess: try current URL plus .pdf, if it exists in the HTML body
+ if ".pdf" not in html_url:
+ url = html_url + ".pdf"
+ if url.encode("utf-8") in html_body:
+ return dict(pdf_url=url, technique="guess-url-plus-pdf")
+
+ return dict()
+
+
+def test_regex() -> None:
+ lines = """
+ blah
+ var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
+ asdf"""
+ m = OVID_JOURNAL_URL_REGEX.search(lines)
+ assert m
+ assert (
+ m.group(1)
+ == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
+ )
+
+ lines = """
+ window.onload = function () {
+ window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client';
+ refreshOriginalWindow();
+ }
+ """
+ url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
+ m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
+ assert m
+ assert m.group(1) == url
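A small usage sketch for extract_fulltext_url() above; the landing page is one of the example URLs mentioned in the comments, and fetching it live is just for illustration:

import requests

from sandcrawler.html import extract_fulltext_url

html_url = "https://www.cogentoa.com/article/10.1080/23311975.2017.1412873"
resp = requests.get(html_url, timeout=30)

hit = extract_fulltext_url(html_url, resp.content)
if hit.get("pdf_url"):
    print("PDF:", hit["pdf_url"], "technique:", hit["technique"])
elif hit.get("next_url"):
    print("follow landing/bounce page:", hit["next_url"])
else:
    print("no fulltext URL found")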
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
new file mode 100644
index 0000000..1e2d197
--- /dev/null
+++ b/python/sandcrawler/html_metadata.py
@@ -0,0 +1,1077 @@
+import datetime
+import sys
+import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
+
+import braveblock
+import dateparser
+import pydantic
+from selectolax.parser import HTMLParser
+
+from sandcrawler.misc import url_fuzzy_equal
+
+# this is a map of metadata keys to CSS selectors
+# sources for this list include:
+# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
+# - inspection of actual publisher HTML
+# - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata
+# - "HTML meta tags used by journal articles"
+# https://gist.github.com/hubgit/5985963
+# the order of these is mostly by preference/quality (best option first), though
+# entries are also sometimes re-ordered for lookup efficiency (lookup stops after
+# the first match)
+HEAD_META_PATTERNS: Dict[str, List[str]] = {
+ "title": [
+ "meta[name='citation_title']",
+ "meta[name='eprints.title']",
+ "meta[name='prism.title']",
+ "meta[name='bepress_citation_title']",
+ "meta[name='og:title']",
+ "meta[name='dcterms.title']",
+ "meta[name='dc.title']",
+ ],
+ "subtitle": [
+ "meta[name='prism.subtitle']",
+ ],
+ "doi": [
+ "meta[name='citation_doi']",
+ "meta[name='DOI']",
+ "meta[id='DOI']",
+ "meta[name='prism.doi']",
+ "meta[name='bepress_citation_doi']",
+ "meta[name='dc.identifier.doi']",
+ "meta[name='dc.identifier'][scheme='doi']",
+ ],
+ "pmid": [
+ "meta[name='citation_pmid']",
+ ],
+ "abstract": [
+ "meta[name='citation_abstract']",
+ "meta[name='bepress_citation_abstract']",
+ "meta[name='eprints.abstract']",
+ "meta[name='dcterms.abstract']",
+ "meta[name='prism.teaser']",
+ "meta[name='dc.description']",
+ "meta[name='og:description']",
+ ],
+ "container_name": [
+ "meta[name='citation_journal_title']",
+ "meta[name='bepress_citation_journal_title']",
+ "meta[name='citation_conference_title']",
+ "meta[name='bepress_citation_conference_title']",
+ "meta[name='prism.publicationName']",
+ "meta[name='eprints.publication']",
+ "meta[name='dc.relation.ispartof']",
+ "meta[name='dc.source']",
+ "meta[property='og:site_name']",
+ ],
+ "container_abbrev": [
+ "meta[name='citation_journal_abbrev']",
+ ],
+ "raw_date": [
+ "meta[name='citation_publication_date']",
+ "meta[name='bepress_citation_publication_date']",
+ "meta[name='prism.publicationDate']",
+ "meta[name='citation_date']",
+ "meta[name='bepress_citation_date']",
+ "meta[name='citation_online_date']",
+ "meta[name='bepress_citation_online_date']",
+ "meta[itemprop='datePublished']",
+ "meta[name='article:published']",
+ "meta[name='eprints.datestamp']",
+ "meta[name='eprints.date']",
+ "meta[name='dc.date.created']",
+ "meta[name='dc.issued']",
+ "meta[name='dcterms.date']",
+ "meta[name='dc.date']",
+ ],
+ "release_year": [
+ "meta[itemprop='citation_year']",
+ "meta[itemprop='prism:copyrightYear']",
+ ],
+ "first_page": [
+ "meta[name='citation_firstpage']",
+ "meta[name='bepress_citation_firstpage']",
+ "meta[name='prism.startingPage']",
+ "meta[name='dc.citation.spage']",
+ ],
+ "last_page": [
+ "meta[name='citation_lastpage']",
+ "meta[name='bepress_citation_lastpage']",
+ "meta[name='prism.endingPage']",
+ "meta[name='dc.citation.epage']",
+ ],
+ "issue": [
+ "meta[name='citation_issue']",
+ "meta[name='bepress_citation_issue']",
+ "meta[name='prism.issueIdentifier']",
+ "meta[name='dc.citation.issue']",
+ ],
+ "volume": [
+ "meta[name='citation_volume']",
+ "meta[name='bepress_citation_volume']",
+ "meta[name='prism.volume']",
+ "meta[name='dc.citation.volume']",
+ ],
+ "number": [
+ "meta[name='citation_technical_report_number']",
+ "meta[name='bepress_citation_technical_report_number']",
+ "meta[name='citation_number']",
+ "meta[name='bepress_citation_number']",
+ "meta[name='prism.number']",
+ ],
+ "container_issn": [
+ "meta[name='citation_issn']",
+ "meta[name='bepress_citation_issn']",
+ "meta[name='prism.issn']",
+ "meta[name='prism.eIssn']",
+ "meta[name='eprints.issn']",
+ "meta[name='dc.source.issn']",
+ ],
+ "isbn": [
+ "meta[name='citation_isbn']",
+ "meta[name='bepress_citation_isbn']",
+ "meta[name='prism.isbn']",
+ ],
+ "publisher": [
+ "meta[name='citation_publisher']",
+ "meta[name='bepress_citation_publisher']",
+ "meta[name='eprints.publisher']",
+ "meta[name='citation_technical_report_institution']",
+ "meta[name='dcterms.publisher']",
+ "meta[name='dc.publisher']",
+ ],
+ "raw_release_type": [
+ "meta[name='citation_article_type']",
+ "meta[name='bepress_citation_article_type']",
+ "meta[name='prism.contentType']",
+ "meta[name='eprints.type']",
+ "meta[name='dc.type']",
+ ],
+ "lang": [
+ "meta[name='citation_language']",
+ "meta[name='bepress_citation_language']",
+ "meta[name='dcterms.language']",
+ "meta[name='dc.language']",
+ "meta[name='og:locale']",
+ ],
+}
+
+HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = {
+ "contrib_names": [
+ "meta[name='citation_author']",
+ "meta[name='bepress_citation_author']",
+ "meta[name='eprints.creators_name']",
+ "meta[name='dcterms.creator']",
+ "meta[name='article:author']",
+ "meta[name='dc.creator']",
+ "meta[name='dc.contributor']",
+ ],
+ # TODO: citation_author_institution
+ "raw_references": [
+ "meta[name='citation_reference']",
+ ],
+ "raw_identifiers": [
+ "meta[name='eprints.id_number']",
+ "meta[name='dcterms.identifier']",
+ "meta[name='dc.identifier']",
+ ],
+}
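To illustrate how these selector lists are consumed (by html_extract_biblio() further down): for each field, the first selector that matches an element with a non-empty content attribute wins. A minimal sketch mirroring that lookup:

from selectolax.parser import HTMLParser

from sandcrawler.html_metadata import HEAD_META_PATTERNS

doc = HTMLParser("""<html><head>
  <meta name="dc.title" content="Lower-priority title (ignored)">
  <meta name="citation_title" content="An Example Article">
</head></html>""")
head = doc.css_first("head")

for selector in HEAD_META_PATTERNS["title"]:
    elem = head.css_first(selector)
    if elem and elem.attrs.get("content"):
        print(selector, "->", elem.attrs["content"])
        break
# prints: meta[name='citation_title'] -> An Example Article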
+
+XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "selector": "meta[name='citation_xml_url']",
+ "attr": "content",
+ "technique": "citation_xml_url",
+ },
+ {
+ "selector": "meta[name='fulltext_xml']",
+ "attr": "content",
+ "technique": "fulltext_xml",
+ },
+ {
+ "selector": "link[rel='alternate'][type='application/xml']",
+ "attr": "href",
+ "technique": "alternate link",
+ },
+ {
+ "selector": "link[rel='alternate'][type='text/xml']",
+ "attr": "href",
+ "technique": "alternate link",
+ },
+ {
+ "in_doc_url": "scielo",
+ "in_fulltext_url": "articleXML",
+ "selector": "a[target='xml']",
+ "attr": "href",
+ "technique": "SciElo XML link",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "viewXML",
+ "selector": "a[class='obj_galley_link']",
+ "attr": "href",
+ "technique": "OJS Gallery XML link",
+ },
+ {
+ "in_fulltext_url": "/download/xml/",
+ "selector": "a[title='XML']",
+ "attr": "href",
+ "technique": "ARPHA XML link",
+ "example_page": "https://zookeys.pensoft.net/article/26391",
+ },
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "xml",
+ "selector": "a.download-files-nlm",
+ "attr": "href",
+ "technique": "XML (NLM) download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
+]
+
+HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "selector": "meta[name='citation_fulltext_html_url']",
+ "attr": "content",
+ "technique": "citation_fulltext_html_url",
+ },
+ {
+ "selector": "link[rel='alternate'][type='text/html']",
+ "attr": "href",
+ "technique": "alternate link",
+ },
+ {
+ "in_doc_url": "/article/view/",
+ "in_fulltext_url": "inline=1",
+ "selector": "iframe[name='htmlFrame']",
+ "attr": "src",
+ "technique": "OJS HTML iframe",
+ },
+ {
+ "in_doc_url": "dovepress.com",
+ "in_fulltext_url": "-fulltext-",
+ "selector": "a[id='view-full-text']",
+ "attr": "href",
+ "technique": "dovepress fulltext link",
+ },
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
+]
+
+COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "in_doc_url": "pensoft.net/article/", # also /element/
+ "in_fulltext_url": "/download/fig/",
+ "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small",
+ "attr": "href",
+ "technique": "Active figure download link (zookeys)",
+ "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/",
+ },
+ {
+ "in_doc_url": "/file.xhtml?persistentId",
+ "in_fulltext_url": "/access/datafile/",
+ "selector": "div.form-group code",
+ "use_body": "true",
+ "technique": "Dataverse 'download URL'",
+ "example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0",
+ },
+]
+
+# This is a database of matching patterns. Most of these were discovered by hand,
+# looking at OA journal content that failed to crawl/ingest.
+PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+ {
+ "selector": "head meta[name='citation_pdf_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url",
+ },
+ {
+ "selector": "head meta[name='bepress_citation_pdf_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url",
+ },
+ {
+ "in_doc_url": "journals.lww.com",
+ "selector": "head meta[name='wkhealth_pdf_url']",
+ "attr": "content",
+ "technique": "wkhealth_pdf_url",
+ "example_page": "https://journals.lww.com/otainternational/Fulltext/2019/03011/Trauma_systems_in_North_America.2.aspx",
+ },
+ {
+ "selector": "head meta[property='citation_pdf_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url",
+ # eg, researchgate
+ },
+ {
+ "selector": "head meta[name='eprints.document_url']",
+ "attr": "content",
+ "technique": "citation_pdf_url (property)",
+ },
+ {
+ "in_doc_url": "/doi/10.",
+ "in_fulltext_url": "/doi/pdf/",
+ "selector": "a.show-pdf",
+ "attr": "href",
+ "technique": "SAGE/UTP show-pdflink",
+ "example_page": "https://journals.sagepub.com/doi/10.1177/2309499019888836",
+ # also http://utpjournals.press/doi/10.3138/cjh.ach.54.1-2.05
+ },
+ {
+ "in_doc_url": "/doi/10.",
+ "in_fulltext_url": "/doi/pdf/",
+ "selector": "a[title='PDF']",
+ "attr": "href",
+ "technique": "title=PDF link",
+ "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379",
+ },
+ {
+ "in_doc_url": "/view/",
+ "selector": "a#pdfDownloadLink",
+ "attr": "href",
+ "technique": "OJS pdfDownloadLink link",
+ "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
+ },
+ {
+ "in_fulltext_url": "/pdf/",
+ "selector": "a.show-pdf",
+ "attr": "href",
+ "technique": "SAGE PDF link",
+ "example_page": "http://journals.sagepub.com/doi/pdf/10.1177/2309499019888836",
+ },
+ {
+ "in_doc_url": "://elifesciences.org/articles/",
+ "in_fulltext_url": "/download/",
+ "selector": "a[data-download-type='pdf-article']",
+ "attr": "href",
+ "technique": "eLife PDF link",
+ "example_page": "https://elifesciences.org/articles/59841",
+ },
+ {
+ "in_doc_url": "://www.jcancer.org/",
+ "in_fulltext_url": ".pdf",
+ "selector": ".divboxright a.text-button",
+ "attr": "href",
+ "technique": "jcancer PDF link",
+ "example_page": "https://www.jcancer.org/v10p4038.htm",
+ },
+ {
+ "in_doc_url": "://www.tandfonline.com/doi/full/10.",
+ "in_fulltext_url": "/pdf/",
+ "selector": "a.show-pdf",
+ "attr": "href",
+ "technique": "t+f show-pdf link",
+ "example_page": "https://www.tandfonline.com/doi/full/10.1080/19491247.2019.1682234",
+ },
+ {
+ "in_doc_url": "article_id=",
+ "in_fulltext_url": "download.php",
+ "selector": "a.file.pdf",
+ "attr": "href",
+ "technique": "pdf file link",
+ "example_page": "http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405",
+ },
+ {
+ "in_doc_url": "/content/10.",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[title='Download']",
+ "attr": "href",
+ "technique": "pdf file link",
+ "example_page": "https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.11.2000230",
+ },
+ {
+ "selector": "embed[type='application/pdf']",
+ "attr": "src",
+ "technique": "PDF embed",
+ "example_page": "http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401",
+ },
+ {
+ "in_doc_url": "/html/",
+ "in_fulltext_url": "create_pdf",
+ "selector": ".AbsPdfFigTab img[src='images/pdf-icon.jpg'] + a",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.aed.org.cn/nyzyyhjxb/html/2018/4/20180408.htm",
+ },
+ {
+ "in_doc_url": "/archive-detail/",
+ "in_fulltext_url": ".pdf",
+ "selector": ".contact-list a.download-pdf",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439",
+ },
+ {
+ "in_doc_url": "degruyter.com/document/",
+ "in_fulltext_url": "/pdf",
+ "selector": "a.downloadPdf",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
+ },
+ {
+ "in_doc_url": "repositorio.unicamp.br/handle/",
+ "in_fulltext_url": "/bitstream/",
+ "selector": "table.panel-body a[target='_blank']",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750",
+ },
+ {
+ "in_doc_url": "dlc.library.columbia.edu/durst/",
+ "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]",
+ "attr": "href",
+ "technique": "Access URL link",
+ "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9",
+ },
+ {
+ "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi",
+ "in_fulltext_url": "pdf",
+ "selector": "p a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29",
+ },
+ {
+ "in_doc_url": "preprints.jmir.org/preprint/",
+ "selector": "a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://preprints.jmir.org/preprint/22556",
+ },
+ {
+ "in_doc_url": "bloomsburycollections.com/",
+ "in_fulltext_url": "pdf",
+ "selector": "li.download-item a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries",
+ },
+ {
+ "in_doc_url": "emerald.com/insight/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.intent_pdf_link",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html",
+ },
+ {
+ "in_doc_url": "ingentaconnect.com/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[data-popup]",
+ "attr": "data-popup",
+ "technique": "PDF URL link",
+ "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007",
+ },
+ {
+ "in_doc_url": "library.wur.nl/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.wl_full_text_restricted",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://library.wur.nl/WebQuery/wurpubs/529922",
+ },
+ {
+ "in_doc_url": "/dlibra/",
+ "in_fulltext_url": "pdf",
+ "selector": "iframe#js-main-frame",
+ "attr": "src",
+ "technique": "PDF iframe (dlibra)",
+ "example_page": "https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031",
+ },
+ {
+ "in_doc_url": "/handle/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.misc table.inner tr.b a",
+ "attr": "href",
+ "technique": "PDF URL link (DSpace, first file)",
+ "example_page": "https://orbi.uliege.be/handle/2268/174200",
+ },
+ {
+ "in_doc_url": "/publications/",
+ "in_fulltext_url": "pdf",
+ "selector": ".publication-sidebar li.open-access a.document-link",
+ "attr": "href",
+ "technique": "PDF URL link (Pure repo, OA link)",
+ "example_page": "https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance",
+ },
+ {
+ "in_doc_url": "//hal",
+ "selector": ".widget-openaccess .widget-content a",
+ "attr": "href",
+ "technique": "Fulltext OA URL (HAL)",
+ "example_page": "https://hal.archives-ouvertes.fr/hal-00744951",
+ },
+ {
+ "in_doc_url": "/record/",
+ "in_fulltext_url": "pdf",
+ "selector": "#detailedrecordminipanelfile a",
+ "attr": "href",
+ "technique": "PDF URL link (Invenio)",
+ "example_page": "https://bib-pubdb1.desy.de/record/416556",
+ },
+ {
+ "in_doc_url": "/available/",
+ "in_fulltext_url": "pdf",
+ "selector": "table.file-table a",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://etd.adm.unipi.it/theses/available/etd-05302014-183910/",
+ },
+ {
+ "in_doc_url": "/islandora/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.islandora-pdf-link",
+ "attr": "href",
+ "technique": "PDF URL link (Islandora)",
+ "example_page": "http://fau.digital.flvc.org/islandora/object/fau%3A9804",
+ },
+ {
+ "in_doc_url": "/receive/",
+ "in_fulltext_url": "pdf",
+ "selector": ".mir-preview noscript a",
+ "attr": "href",
+ "technique": "PDF iframe via noscript (MyCoRe)",
+ "example_page": "https://www.db-thueringen.de/receive/dbt_mods_00005191",
+ },
+ {
+ "in_doc_url": "/registro.do",
+ "in_fulltext_url": "imagenes",
+ "selector": ".resumen_bib a[data-analytics=media]",
+ "attr": "href",
+ "technique": "Media link (DIGIBIS)",
+ "example_page": "https://bivaldi.gva.es/es/consulta/registro.do?id=11740",
+ },
+ {
+ "in_doc_url": "/view",
+ "in_fulltext_url": "/at_download/",
+ "selector": ".documentContent #content a",
+ "attr": "href",
+ "technique": "Media link (Plone)",
+ "example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view",
+ },
+ {
+ "in_doc_url": "isca-speech.org/",
+ "in_fulltext_url": "pdf",
+ "selector": ".w3-container a",
+ "attr": "href",
+ "technique": "PDF URL link (isca-speech.org)",
+ "example_page": "https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html",
+ },
+ {
+ "in_doc_url": "://repository.dri.ie/",
+ "in_fulltext_url": "/download",
+ "selector": "#dri_download_assets > div > a",
+ "attr": "href",
+ "technique": "Download link (repository.dri.ie)",
+ "example_page": "https://repository.dri.ie/catalog/qf8621102",
+ },
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.download-files-pdf",
+ "attr": "href",
+ "technique": "PDF Download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
+ {
+ "in_doc_url": "cureus.com/",
+ "in_fulltext_url": "pdf",
+ "selector": ".small-medium-pdf a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF Download link (cureus.com)",
+ "example_page": "https://www.cureus.com/articles/69542-tramadol-induced-jerks",
+ },
+ {
+ "in_doc_url": "e-manuscripta.ch/",
+ "in_fulltext_url": "pdf",
+ "selector": "#titleinfoPdfDownload a.resourceLink",
+ "attr": "href",
+ "technique": "PDF Download link (e-manuscripta.ch)",
+ "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
+ },
+ {
+ "in_doc_url": "journals.uchicago.edu",
+ "in_fulltext_url": "pdf",
+ "selector": "nav.article__navbar a.ctrl--pdf",
+ "attr": "href",
+ "technique": "PDF Download link (journals.uchicago.edu)",
+ "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
+ },
+ {
+ "in_doc_url": "integrityresjournals.org",
+ "in_fulltext_url": "/article-full-text-pdf/",
+ "selector": "a[target='_blank'].btn-danger",
+ "attr": "href",
+ "technique": "PDF Download link (integrityresjournals.org)",
+ "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body.pkp_page_article a.download",
+ "attr": "href",
+ "technique": "OJS PDF Embed",
+ "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+ },
+ {
+ "in_doc_url": "/article/view/",
+ "in_fulltext_url": "/article/",
+ "selector": "a.pdf",
+ "attr": "href",
+ "technique": "OJS PDF link",
+ },
+ {
+ "in_doc_url": "scitemed.com/article/",
+ "in_fulltext_url": ".pdf",
+ "selector": "li.tab_pdf_btn a",
+ "attr": "href",
+ "technique": "PDF link (scitemed.com)",
+ },
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
+ {
+ "in_doc_url": "/jvi.aspx",
+ "in_fulltext_url": "download_fulltext",
+ "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+ "attr": "href",
+ "technique": "erciyesmedj.com publication system PDF download link",
+ },
+ {
+ "selector": "body embed[alt='pdf']",
+ "attr": "src",
+ "technique": "embed PDF",
+ "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913",
+ },
+ {
+ "in_fulltext_url": "viewPDFInterstitial",
+ "in_doc_url": "/view/",
+ "selector": "frameset frame",
+ "attr": "src",
+ "technique": "PDF iframe (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ # note this one has a special handler
+ "in_doc_url": "viewPDFInterstitial",
+ "in_fulltext_url": "://",
+ "selector": "head meta[http-equiv='refresh']",
+ "attr": "content",
+ "technique": "HTML meta refresh (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ "in_doc_url": "dlib.si/details/",
+ "in_fulltext_url": "PDF",
+ "selector": "body #FilesBox a",
+ "attr": "href",
+ "technique": "dlib.si download links",
+ "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ",
+ },
+ {
+ "in_doc_url": "filclass.ru",
+ "in_fulltext_url": "pdf",
+ "selector": "main .pdf-article a.pdficon",
+ "attr": "href",
+ "technique": "filclass.ru PDF link",
+ "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism",
+ },
+ {
+ "in_doc_url": "cdnsciencepub.com",
+ "in_fulltext_url": "pdf",
+ "selector": "article .info-panel a.btn--pdf",
+ "attr": "href",
+ "technique": "cdnsciencepub.com PDF link",
+ "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011",
+ },
+ {
+ "in_doc_url": "grrjournal.com",
+ "in_fulltext_url": "pdf",
+ "selector": ".ereaders-main-section a[download]",
+ "attr": "href",
+ "technique": "grrjournal.com PDF link",
+ "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "pdf",
+ "selector": "#articleFullText a.remote_pdf",
+ "attr": "href",
+ "technique": "OJS remote_pdf link",
+ "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/abs/",
+ "in_fulltext_url": "/reader/",
+ "selector": "article.container .single__download a",
+ "attr": "href",
+ "technique": "worldscientific landing pages",
+ "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/",
+ "in_fulltext_url": "/pdf/",
+ "selector": "noscript a[target='_blank']",
+ "attr": "href",
+ "technique": "worldscientific reader",
+ "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": ".container .view-content .download-article a",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": "body a.download-pdf",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/view/",
+ "selector": "body .entry_details a.pdf",
+ "attr": "href",
+ "technique": "generic OJS/preprints",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022",
+ },
+ {
+ "in_doc_url": "/view/",
+ "in_fulltext_url": "/download/",
+ "selector": "body header a.download",
+ "attr": "href",
+ "technique": "generic OJS/preprints PDF Embed",
+ "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327",
+ },
+]
+
+FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
+ # wiley has a weird almost-blank page we don't want to loop on
+ "://onlinelibrary.wiley.com/doi/pdf/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "{'embed': '",
+]
+
+FULLTEXT_URL_PREFIX_SKIP: List[str] = [
+ "javascript:",
+ "about:",
+]
+
+RELEASE_TYPE_MAP: Dict[str, str] = {
+ "research article": "article-journal",
+ "text.serial.journal": "article-journal",
+}
+
+
+class BiblioMetadata(pydantic.BaseModel):
+ title: Optional[str]
+ subtitle: Optional[str]
+ contrib_names: Optional[List[str]]
+ release_date: Optional[datetime.date]
+ release_year: Optional[int]
+ release_type: Optional[str]
+ release_stage: Optional[str]
+ withdrawn_status: Optional[str]
+ lang: Optional[str]
+ country_code: Optional[str]
+ volume: Optional[str]
+ issue: Optional[str]
+ number: Optional[str]
+ pages: Optional[str]
+ first_page: Optional[str]
+ last_page: Optional[str]
+ license: Optional[str]
+ publisher: Optional[str]
+ container_name: Optional[str]
+ container_abbrev: Optional[str]
+ container_issn: Optional[str]
+ container_type: Optional[str]
+ raw_references: Optional[List[str]]
+
+ doi: Optional[str]
+ pmid: Optional[str]
+ pmcid: Optional[str]
+ isbn13: Optional[str]
+ publisher_ident: Optional[str]
+ oai_id: Optional[str]
+
+ abstract: Optional[str]
+ pdf_fulltext_url: Optional[str]
+ html_fulltext_url: Optional[str]
+ xml_fulltext_url: Optional[str]
+ component_url: Optional[str]
+
+ class Config:
+ json_encoders = {datetime.date: lambda dt: dt.isoformat()}
+
+
+def html_extract_fulltext_url(
+ doc_url: str, doc: HTMLParser, patterns: List[dict]
+) -> Optional[Tuple[str, str]]:
+ """
+ Tries to quickly extract fulltext URLs using a set of patterns. This
+    function is intended to be generic across various extraction techniques.
+
+    Returns None or a tuple of (url, technique)
+ """
+ self_doc_url: Optional[Tuple[str, str]] = None
+ for pattern in patterns:
+ if "selector" not in pattern:
+ continue
+ if "in_doc_url" in pattern:
+ if pattern["in_doc_url"] not in doc_url:
+ continue
+ elem = doc.css_first(pattern["selector"])
+ if not elem:
+ continue
+ val = None
+ if "attr" in pattern:
+ val = elem.attrs.get(pattern["attr"])
+ # handle HTML redirect
+ if val and pattern["attr"] == "content" and "URL=" in val:
+ val = val.split("URL=")[1]
+ elif pattern.get("use_body"):
+ val = elem.text()
+ if "://" not in val:
+ continue
+ if not val:
+ continue
+ val = urllib.parse.urljoin(doc_url, val)
+ assert val
+ if "in_fulltext_url" in pattern:
+ if pattern["in_fulltext_url"] not in val:
+ continue
+ skip_matched = False
+ for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
+ if skip_pattern in val.lower():
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ for skip_pattern in FULLTEXT_URL_PREFIX_SKIP:
+ if val.lower().startswith(skip_pattern):
+ skip_matched = True
+ break
+ if skip_matched:
+ continue
+ if url_fuzzy_equal(doc_url, val):
+ # don't link to self, unless no other options
+ self_doc_url = (val, pattern.get("technique", "unknown"))
+ continue
+
+ # quirks modes / hacks
+ if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"):
+ val = val[:-1]
+
+ return (val, pattern.get("technique", "unknown"))
+ if self_doc_url:
+ print(" WARN: returning fulltext URL pointing to self", file=sys.stderr)
+ return self_doc_url
+ return None
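A minimal sketch of calling this helper directly with the PDF pattern set defined above; the document URL and HTML snippet are made up for illustration:

from selectolax.parser import HTMLParser

from sandcrawler.html_metadata import PDF_FULLTEXT_PATTERNS, html_extract_fulltext_url

doc_url = "https://journal.example.org/article/view/123"  # hypothetical
html = """<html><head>
  <meta name="citation_pdf_url" content="https://journal.example.org/article/download/123.pdf">
</head></html>"""

hit = html_extract_fulltext_url(doc_url, HTMLParser(html), PDF_FULLTEXT_PATTERNS)
if hit:
    url, technique = hit
    print(technique, url)
    # -> citation_pdf_url https://journal.example.org/article/download/123.pdf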
+
+
+def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
+
+ meta: Any = dict()
+ head = doc.css_first("head")
+ if not head:
+ print(f"WARN: empty <head>? {doc_url}", file=sys.stderr)
+ return None
+
+ for field, patterns in HEAD_META_PATTERNS.items():
+ for pattern in patterns:
+ val = head.css_first(pattern)
+ # print((field, pattern, val))
+ if val and "content" in val.attrs and val.attrs["content"]:
+ meta[field] = val.attrs["content"]
+ break
+
+ for field, patterns in HEAD_META_LIST_PATTERNS.items():
+ for pattern in patterns:
+ val_list = head.css(pattern)
+ if val_list:
+ for val in val_list:
+ if "content" in val.attrs and val.attrs["content"]:
+ if field not in meta:
+ meta[field] = []
+ meta[field].append(val.attrs["content"])
+ break
+
+ # (some) fulltext extractions
+ pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS)
+ if pdf_fulltext_url:
+ meta["pdf_fulltext_url"] = pdf_fulltext_url[0]
+ xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS)
+ if xml_fulltext_url:
+ meta["xml_fulltext_url"] = xml_fulltext_url[0]
+ html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS)
+ if html_fulltext_url:
+ meta["html_fulltext_url"] = html_fulltext_url[0]
+ component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS)
+ if component_url:
+ meta["component_url"] = component_url[0]
+
+ # TODO: replace with clean_doi() et al
+ if meta.get("doi") and meta.get("doi").startswith("doi:"):
+ meta["doi"] = meta["doi"][4:]
+
+ raw_identifiers = meta.pop("raw_identifiers", [])
+ for ident in raw_identifiers:
+ if ident.startswith("doi:10."):
+ if "doi" not in meta:
+ meta["doi"] = ident.replace("doi:", "")
+ elif ident.startswith("10.") and "/" in ident:
+ if "doi" not in meta:
+ meta["doi"] = ident
+ elif ident.startswith("isbn:"):
+ if "isbn" not in meta:
+ meta["isbn"] = ident.replace("isbn:", "")
+
+ raw_date = meta.pop("raw_date", None)
+ if raw_date:
+ parsed = dateparser.parse(raw_date)
+ if parsed:
+ meta["release_date"] = parsed.date()
+
+ raw_release_type = meta.pop("raw_release_type", None)
+ if raw_release_type:
+ release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip())
+ if release_type:
+ meta["release_type"] = release_type
+
+ return BiblioMetadata(**meta)
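End-to-end, the extraction above can be exercised against a landing page like this; the URL is one of the example pages listed in the patterns, and network access plus pydantic v1-style .json() serialization are assumptions:

import requests
from selectolax.parser import HTMLParser

from sandcrawler.html_metadata import html_extract_biblio

doc_url = "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full"
resp = requests.get(doc_url, timeout=30)

biblio = html_extract_biblio(doc_url, HTMLParser(resp.text))
if biblio:
    print(biblio.title, biblio.doi, biblio.pdf_fulltext_url)
    # serialize, using the custom date encoder from the Config above
    print(biblio.json(exclude_none=True))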
+
+
+def load_adblock_rules() -> braveblock.Adblocker:
+ """
+ TODO: consider blocking very generic assets:
+ - ://fonts.googleapis.com/css*
+ - ://journals.plos.org/plosone/resource/img/icon.*
+ """
+ return braveblock.Adblocker(
+ include_easylist=True,
+ include_easyprivacy=True,
+ rules=[
+ "/favicon.ico^",
+ "||fonts.googleapis.com^",
+ "||widgets.figshare.com^",
+ "||crossmark-cdn.crossref.org^",
+ "||crossmark.crossref.org^",
+ "||platform.twitter.com^",
+ "||verify.nature.com^",
+ "||s7.addthis.com^",
+ "||www.mendeley.com^",
+ "||pbs.twimg.com^",
+ "||badge.dimensions.ai^",
+ "||recaptcha.net^",
+ "||tag.imagino.com^",
+ "||consent.cookiebot.com^",
+ # not sure about these CC badges (usually via a redirect)
+ # "||licensebuttons.net^",
+ # "||i.creativecommons.org^",
+ # Should we skip jquery, or other generic javascript CDNs?
+ # "||code.jquery.com^",
+ # "||ajax.googleapis.com^",
+ # "||cdnjs.cloudflare.com^",
+ # badges, "share" buttons, tracking, etc
+ "apis.google.com/js/plusone",
+ "www.google.com/recaptcha/",
+            "js/_getUACode.js",
+ # PLOS images
+ "/resource/img/icon.*.16.png^",
+ # CAIRN broken tracking tag
+ "cairn-int.info//about.php?cairn_guest=",
+ ],
+ )
+
+
+def _extract_generic(
+ doc: HTMLParser, selector: str, attrs: List[str], type_name: str
+) -> List[Dict[str, str]]:
+ resources = []
+
+ for node in doc.css(selector):
+ for attr in attrs:
+ if attr not in node.attrs:
+ continue
+ url = node.attrs.get(attr)
+ # special-case a couple meta URI prefixes which don't match with adblock rules
+ skip = False
+ for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]:
+ if url and url.startswith(prefix):
+ skip = True
+ break
+ if url and "/" not in url and "." not in url and " " in url:
+ # eg: "Ce fichier n'existe pas"
+ skip = True
+ if skip:
+ continue
+ if url and url.startswith("https://https://"):
+ url = url[8:]
+ elif url and url.startswith("http://http://"):
+ url = url[7:]
+ if url:
+ # print(url, file=sys.stderr)
+ resources.append(dict(url=url.strip(), type=type_name))
+
+ return resources
+
+
+def html_extract_resources(
+ doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker
+) -> List[Dict[str, str]]:
+ """
+ This function tries to find all the important resources in a page. The
+ presumption is that the HTML document is article fulltext, and we want the
+ list of all resources (by URL) necessary to replay the page.
+
+ The returned resource URLs each have a type (script, img, css, etc), and
+ should be fully-qualified URLs (not relative).
+
+ Adblock filtering is run to remove unwanted resources.
+ """
+ resources = []
+
+ # select various resource references
+ resources += _extract_generic(doc, "script", ["src"], "script")
+ resources += _extract_generic(doc, "link[rel='stylesheet']", ["href"], "stylesheet")
+    # TODO: also parse srcset attributes (multiple URLs per attribute)
+ # eg: https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w
+ resources += _extract_generic(doc, "img", ["src"], "image")
+ resources += _extract_generic(doc, "audio", ["src"], "audio")
+ resources += _extract_generic(doc, "video", ["src"], "media")
+ resources += _extract_generic(doc, "source", ["src"], "media")
+ resources += _extract_generic(doc, "track", ["src"], "media")
+ resources += _extract_generic(doc, "iframe", ["src"], "subdocument")
+ resources += _extract_generic(doc, "embed", ["src"], "media")
+
+ # ensure URLs are absolute
+ for r in resources:
+ r["url"] = urllib.parse.urljoin(doc_url, r["url"])
+
+ # filter using adblocker
+ resources = [
+ r
+ for r in resources
+ if adblock.check_network_urls(r["url"], source_url=doc_url, request_type=r["type"])
+ is False
+ ]
+
+ # remove duplicates
+ resources = [dict(t) for t in {tuple(d.items()) for d in resources}]
+
+ return resources
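A short sketch tying the last two helpers together; building the adblock rules is comparatively expensive, so the intent (assumed) is to construct the Adblocker once and reuse it across pages:

import requests
from selectolax.parser import HTMLParser

from sandcrawler.html_metadata import html_extract_resources, load_adblock_rules

adblock = load_adblock_rules()  # build once, reuse for many documents

doc_url = "https://journal.example.org/article/view/123/fulltext"  # hypothetical
resp = requests.get(doc_url, timeout=30)

resources = html_extract_resources(doc_url, HTMLParser(resp.content), adblock)
for r in resources:
    print(r["type"], r["url"])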
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 365cf82..3ab4971 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1,135 +1,1446 @@
-
# XXX: some broken MRO thing going on in here due to python3 object wrangling
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os, sys
+import datetime
+import gzip
+import http.client
+import json
+import os
+import sys
+import time
+import urllib.parse
+from collections import namedtuple
+from http.client import IncompleteRead
+from typing import Any, Dict, List, Optional, Tuple, Union
+
import requests
+import urllib3.exceptions
+
+# not sure this will really work. Should go before wayback imports.
+http.client._MAXHEADERS = 1000 # type: ignore
import wayback.exception
-from http.client import IncompleteRead
+from gwb.loader import CDXLoaderFactory3
from wayback.resourcestore import ResourceStore
-from gwb.loader import CDXLoaderFactory
+
+from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session
+
+
+class SandcrawlerBackoffError(Exception):
+ """
+ A set of Exceptions which are raised through multiple abstraction layers to
+ indicate backpressure. For example, SPNv2 back-pressure sometimes needs to
+ be passed up through any timeout/retry code and become an actual long pause
+ or crash.
+ """
+
+ pass
+
+
+ResourceResult = namedtuple(
+ "ResourceResult",
+ [
+ "start_url",
+ "hit",
+ "status",
+ "terminal_url",
+ "terminal_dt",
+ "terminal_status_code",
+ "body",
+ "cdx",
+ "revisit_cdx",
+ ],
+)
+
+WarcResource = namedtuple(
+ "WarcResource",
+ [
+ "status_code",
+ "location",
+ "body",
+ "revisit_cdx",
+ ],
+)
+
+CdxRow = namedtuple(
+ "CdxRow",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ "warc_csize",
+ "warc_offset",
+ "warc_path",
+ ],
+)
+
+CdxPartial = namedtuple(
+ "CdxPartial",
+ [
+ "surt",
+ "datetime",
+ "url",
+ "mimetype",
+ "status_code",
+ "sha1b32",
+ "sha1hex",
+ ],
+)
+
+
+def cdx_partial_from_row(row: Union[CdxRow, CdxPartial]) -> CdxPartial:
+ return CdxPartial(
+ surt=row.surt,
+ datetime=row.datetime,
+ url=row.url,
+ mimetype=row.mimetype,
+ status_code=row.status_code,
+ sha1b32=row.sha1b32,
+ sha1hex=row.sha1hex,
+ )
+
+
+def cdx_to_dict(cdx: Union[CdxRow, CdxPartial]) -> Dict[str, Any]:
+ d = {
+ "surt": cdx.surt,
+ "datetime": cdx.datetime,
+ "url": cdx.url,
+ "mimetype": cdx.mimetype,
+ "status_code": cdx.status_code,
+ "sha1b32": cdx.sha1b32,
+ "sha1hex": cdx.sha1hex,
+ }
+ if type(cdx) == CdxRow and "/" in cdx.warc_path:
+ d["warc_csize"] = cdx.warc_csize
+ d["warc_offset"] = cdx.warc_offset
+ d["warc_path"] = cdx.warc_path
+ return d
+
+
+def fuzzy_match_url(left: str, right: str) -> bool:
+ """
+ Matches URLs agnostic of http/https (and maybe other normalizations in the
+ future)
+ """
+ if left == right:
+ return True
+ if "://" in left and "://" in right:
+ left = "://".join(left.split("://")[1:])
+ right = "://".join(right.split("://")[1:])
+ if left == right:
+ return True
+ if left == right + "/" or right == left + "/":
+ return True
+ if left.replace("//", "/") == right.replace("//", "/"):
+ return True
+ return False
+
+
+def test_fuzzy_match_url() -> None:
+ assert fuzzy_match_url("http://thing.com", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "https://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "ftp://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
+ assert (
+ fuzzy_match_url(
+ "https://www.cairn.info/static/images//logo-partners/logo-cnl-negatif.png",
+ "https://www.cairn.info/static/images/logo-partners/logo-cnl-negatif.png",
+ )
+ is True
+ )
+
+ # should probably handle these?
+ assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") is False
+
class CdxApiError(Exception):
pass
-class CdxApiClient:
- def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"):
+class CdxApiClient:
+ def __init__(self, host_url: str = "https://web.archive.org/cdx/search/cdx", **kwargs):
self.host_url = host_url
+ self.http_session = requests_retry_session(retries=3, backoff_factor=3)
+ cdx_auth_token = kwargs.get("cdx_auth_token", os.environ.get("CDX_AUTH_TOKEN"))
+ if not cdx_auth_token:
+ raise Exception(
+ "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)"
+ )
+ self.http_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.CdxApiClient",
+ "Cookie": "cdx_auth_token={}".format(cdx_auth_token),
+ }
+ )
- def lookup_latest(self, url):
+ def _query_api(self, params: Dict[str, str]) -> Optional[List[CdxRow]]:
"""
- Looks up most recent HTTP 200 record for the given URL.
-
- Returns a CDX dict, or None if not found.
-
- XXX: should do authorized lookup using cookie to get all fields
+ Hits CDX API with a query, parses result into a list of CdxRow
"""
-
- resp = requests.get(self.host_url, params={
- 'url': url,
- 'matchType': 'exact',
- 'limit': -1,
- 'filter': 'statuscode:200',
- 'output': 'json',
- })
+ resp = self.http_session.get(self.host_url, params=params)
if resp.status_code != 200:
- raise CDXApiError(resp.text)
+ raise CdxApiError(resp.text)
+ # print(resp.url, file=sys.stderr)
+ if not resp.text:
+ return None
rj = resp.json()
if len(rj) <= 1:
return None
- cdx = rj[1]
- assert len(cdx) == 7 # JSON is short
- cdx = dict(
- surt=cdx[0],
- datetime=cdx[1],
- url=cdx[2],
- mimetype=cdx[3],
- http_status=int(cdx[4]),
- sha1b32=cdx[5],
- sha1hex=b32_hex(cdx[5]),
- )
- return cdx
+ rows = []
+ for raw in rj[1:]:
+ # check number of CDX fields; there is a bug with some rows having
+ # spaces in WARC filename resulting in extra bogus fields
+ if len(raw) != 11:
+ raise CdxApiError(f"CDX response had {len(raw)} fields, not 11 expected")
+
+ # transform "-" ftp status code to a 226
+ status_code = None
+ if raw[4] == "-":
+ if raw[3] != "warc/revisit" and raw[2].startswith("ftp://"):
+ status_code = 226
+ else:
+ status_code = int(raw[4])
+
+ # remove CDX rows with no WARC records (?)
+ if raw[8] == "-" or raw[9] == "-" or raw[10] == "-":
+ continue
+
+ # remove CDX rows with SHA256 (not SHA1) digests
+ if raw[5].startswith("sha-256"):
+ continue
+
+ # remove CDX rows with 'error' digests
+ # TODO: follow-up on this (2022-11-01 sandcrawler errors)
+ if raw[5].lower() == "error":
+ continue
+
+ row = CdxRow(
+ surt=raw[0],
+ datetime=raw[1],
+ url=raw[2],
+ mimetype=raw[3],
+ status_code=status_code,
+ sha1b32=raw[5],
+ sha1hex=b32_hex(raw[5]),
+ warc_csize=int(raw[8]),
+ warc_offset=int(raw[9]),
+ warc_path=raw[10],
+ )
+ assert (row.mimetype == "-") or ("-" not in row)
+ rows.append(row)
+ return rows
+
+ def fetch(
+ self,
+ url: str,
+ datetime: str,
+ filter_status_code: Optional[int] = None,
+ retry_sleep: Optional[int] = None,
+ ) -> CdxRow:
+ """
+ Fetches a single CDX row by url/datetime. Raises a KeyError if not
+ found, because we expect to be looking up a specific full record.
+ """
+ if len(datetime) != 14:
+ raise ValueError(
+ "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)
+ )
+ params: Dict[str, str] = {
+ "url": url,
+ "from": datetime,
+ "to": datetime,
+ "matchType": "exact",
+ "limit": "1",
+ "output": "json",
+ }
+ if filter_status_code:
+ params["filter"] = "statuscode:{}".format(filter_status_code)
+ resp = self._query_api(params)
+ if not resp:
+ if retry_sleep and retry_sleep > 0:
+ next_sleep = None
+ if retry_sleep > 3:
+ next_sleep = retry_sleep - 3
+ retry_sleep = 3
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
+ time.sleep(retry_sleep)
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep
+ )
+ raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
+ row = resp[0]
+ # allow fuzzy http/https match
+ if not (fuzzy_match_url(row.url, url) and row.datetime == datetime):
+ if retry_sleep and retry_sleep > 0:
+ print(
+ " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep),
+ file=sys.stderr,
+ )
+ time.sleep(retry_sleep)
+ return self.fetch(
+ url, datetime, filter_status_code=filter_status_code, retry_sleep=None
+ )
+ raise KeyError(
+ "Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(
+ url, datetime, row
+ )
+ )
+ if filter_status_code:
+ assert row.status_code == filter_status_code
+ return row
+
+ def lookup_best(
+ self,
+ url: str,
+ max_age_days: Optional[int] = None,
+ best_mimetype: Optional[str] = None,
+ closest: Union[datetime.datetime, str, None] = None,
+ ) -> Optional[CdxRow]:
+ """
+ Fetches multiple CDX rows for the given URL and tries to find the best available capture.
+
+ If no matching row is found, return None. Note this is different from fetch.
+
+ Preference order by status code looks like:
+
+ 200 or 226
+ mimetype match
+ not-liveweb
+ most-recent
+ no match
+ not-liveweb
+ most-recent
+ 3xx
+ most-recent
+ 4xx
+ most-recent
+ 5xx
+ most-recent
+
+ """
+ params: Dict[str, str] = {
+ "url": url,
+ "matchType": "exact",
+ "limit": "-40",
+ "output": "json",
+ # Collapsing seems efficient, but is complex; would need to include
+ # other filters and status code in filter
+ #'collapse': 'timestamp:6',
+ # Revisits now allowed and resolved!
+ #'filter': '!mimetype:warc/revisit',
+ }
+ if max_age_days:
+ since = datetime.date.today() - datetime.timedelta(days=max_age_days)
+ params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+ closest_dt = "00000000"
+ if closest:
+ if isinstance(closest, datetime.datetime):
+ closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+ params["closest"] = closest_dt
+ else:
+ closest_dt = closest
+ params["closest"] = closest_dt
+ params["sort"] = "closest"
+ # print(params, file=sys.stderr)
+ rows = self._query_api(params)
+ if not rows:
+ return None
+
+ def _cdx_sort_key(r: CdxRow) -> tuple:
+ """
+ This is a function, not a lambda, because it captures
+ best_mimetype. Will create a tuple that can be used to sort in
+ *reverse* order.
+ """
+ return (
+ int(r.url == url),
+ int(r.status_code in (200, 226)),
+ int(0 - (r.status_code or 999)),
+ int(r.mimetype == best_mimetype),
+ int(r.mimetype != "warc/revisit"),
+ r.datetime[:4] == closest_dt[:4],
+ int(r.datetime),
+ # NOTE: previously we demoted SPN records with this warc_path check ahead of datetime
+ int("/" in r.warc_path),
+ )
+
+ rows = sorted(rows, key=_cdx_sort_key)
+ return rows[-1]
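# A minimal usage sketch for CdxApiClient, assuming CDX_AUTH_TOKEN is set in the
# environment (or passed as cdx_auth_token); the _example_* helper is illustrative
# only. lookup_best() returns None on a miss, while fetch() needs a full 14-digit
# timestamp and raises KeyError when no exact match is found:
def _example_cdx_lookup() -> None:
    cdx_client = CdxApiClient()
    best = cdx_client.lookup_best(
        "https://example.org/paper.pdf", best_mimetype="application/pdf"
    )
    if best:
        row = cdx_client.fetch(best.url, best.datetime)
        print(row.warc_path, row.warc_offset, row.warc_csize, file=sys.stderr)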
class WaybackError(Exception):
pass
-class WaybackClient:
- def __init__(self, cdx_client=None, **kwargs):
+class WaybackContentError(Exception):
+ pass
+
+
+class PetaboxError(Exception):
+ pass
+
+
+class NoCaptureError(Exception):
+ pass
+
+
+class WaybackClient:
+ def __init__(self, cdx_client: Optional[CdxApiClient] = None, **kwargs):
if cdx_client:
self.cdx_client = cdx_client
else:
self.cdx_client = CdxApiClient()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ # this *does* want to be http://, not https://
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret",
+ os.environ.get("PETABOX_WEBDATA_SECRET"),
+ )
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix", "https://archive.org/serve/")
self.rstore = None
+ self.max_redirects = 25
+ self.wayback_endpoint = "https://web.archive.org/web/"
+ self.replay_headers = {
+ "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient",
+ }
+ self.http_session = requests_retry_session()
+ self.record_http_session = requests_retry_session(
+ status_forcelist=[],
+ )
+
+ def fetch_petabox(
+ self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True
+ ) -> WarcResource:
+ """
+ Fetches wayback resource directly from petabox using WARC path/offset/csize.
+
+ If there is a problem with petabox, raises a PetaboxError.
+ If resource doesn't exist, would raise a KeyError (TODO).
+
+ The body is only returned if the record is success (HTTP 200 or
+ equivalent). Otherwise only the status and header info is returned.
+
+ WarcResource object (namedtuple) contains fields:
+ - status_code: int
+ - location: eg, for redirects
+ - body: raw bytes
+ - revisit_cdx: CdxRow for the record this revisit refers to (if resolved)
+
+ resolve_revisit does what it sounds like: tries following a revisit
+ record by looking up CDX API and then another fetch. Refuses to recurse
+ more than one hop (eg, won't follow a chain of revisits).
- def fetch_warc_content(self, warc_path, offset, c_size):
+ Requires (and uses) a secret token.
+ """
+ if not self.petabox_webdata_secret:
+ raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
+ if "/" not in warc_path:
+ raise ValueError(
+ "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path)
+ )
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory3(
+ webdata_secret=self.petabox_webdata_secret,
+ )
+ )
+ assert self.rstore
try:
- gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
+ # print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
+ gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
except wayback.exception.ResourceUnavailable:
- raise WaybackError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
+ except wayback.exception.InvalidResource:
+ print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
+ raise WaybackContentError(
+ "failed to load file contents from wayback/petabox (InvalidResource)"
+ )
+ except urllib3.exceptions.ReadTimeoutError as rte:
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format(
+ rte
+ )
+ )
except ValueError as ve:
- raise WaybackError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (ValueError: {})".format(ve)
+ )
except EOFError as eofe:
- raise WaybackError("failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)
+ )
except TypeError as te:
- raise WaybackError("failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ raise PetaboxError(
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ )
+ )
+ except Exception as e:
+ if "while decompressing data: invalid block type" in str(e):
+ raise PetaboxError(
+ "decompression error fetching WARC record; usually due to bad alexa ARC files"
+ )
+ else:
+ raise e
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
- if gwb_record.get_status()[0] != 200:
- raise WaybackError("archived HTTP response (WARC) was not 200: {}".format(gwb_record.get_status()[0]))
+ try:
+ status_code = gwb_record.get_status()[0]
+ except http.client.HTTPException:
+ raise WaybackContentError("too many HTTP headers (in wayback fetch)")
+ location = gwb_record.get_location() or None
+
+ if (
+ status_code is None
+ and gwb_record.target_uri.startswith(b"ftp://")
+ and not gwb_record.is_revisit()
+ ):
+ # TODO: some additional verification here?
+ status_code = 226
+
+ body = None
+ revisit_cdx = None
+ if gwb_record.is_revisit():
+ if not resolve_revisit:
+ raise WaybackContentError("found revisit record, but won't resolve (loop?)")
+ revisit_uri, revisit_dt = gwb_record.refers_to
+ if not (revisit_uri and revisit_dt):
+ raise WaybackContentError(
+ "revisit record missing URI and/or DT: warc:{} offset:{}".format(
+ warc_path, offset
+ )
+ )
+ # convert revisit_dt
+ # len("2018-07-24T11:56:49"), or with "Z"
+ assert len(revisit_dt) in (19, 20)
+ if type(revisit_uri) is bytes:
+ revisit_uri = revisit_uri.decode("utf-8")
+ if type(revisit_dt) is bytes:
+ revisit_dt = revisit_dt.decode("utf-8")
+ revisit_dt = (
+ revisit_dt.replace("-", "").replace(":", "").replace("T", "").replace("Z", "")
+ )
+ assert len(revisit_dt) == 14
+ try:
+ revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
+ body = self.fetch_petabox_body(
+ csize=revisit_cdx.warc_csize,
+ offset=revisit_cdx.warc_offset,
+ warc_path=revisit_cdx.warc_path,
+ resolve_revisit=False,
+ expected_status_code=revisit_cdx.status_code,
+ )
+ except KeyError as ke:
+ raise WaybackError("Revist resolution failed: {}".format(ke))
+ elif status_code in (200, 226):
+ try:
+ body = gwb_record.open_raw_content().read()
+ except IncompleteRead as ire:
+ raise WaybackError(
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ )
+ )
+ elif status_code is None:
+ raise WaybackContentError("got a None status_code in (W)ARC record")
+ return WarcResource(
+ status_code=status_code,
+ location=location,
+ body=body,
+ revisit_cdx=revisit_cdx,
+ )
+
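    # A minimal sketch of the intended call pattern, assuming `client` is an
    # instance of this class and `row` is a CdxRow previously returned by
    # CdxApiClient (the values come straight from the CDX API):
    #
    #   resource = client.fetch_petabox(
    #       csize=row.warc_csize, offset=row.warc_offset, warc_path=row.warc_path
    #   )
    #   if resource.status_code in (200, 226):
    #       body_bytes = resource.body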
+ def fetch_petabox_body(
+ self,
+ csize: int,
+ offset: int,
+ warc_path: str,
+ resolve_revisit: bool = True,
+ expected_status_code: Optional[int] = None,
+ ) -> bytes:
+ """
+ Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
+
+ Returns bytes. Raises KeyError if resource wasn't an HTTP 200.
+
+ Thin helper around fetch_petabox()
+ """
+ resource = self.fetch_petabox(
+ csize=csize,
+ offset=offset,
+ warc_path=warc_path,
+ resolve_revisit=resolve_revisit,
+ )
+
+ if expected_status_code:
+ if expected_status_code != resource.status_code:
+ raise KeyError(
+ "archived HTTP response (WARC) was not {}: {}".format(
+ expected_status_code,
+ resource.status_code,
+ )
+ )
+ elif resource.status_code not in (200, 226):
+ raise KeyError(
+ "archived HTTP response (WARC) was not 200: {}".format(resource.status_code)
+ )
+
+ return resource.body
+
+ def fetch_replay_body(
+ self, url: str, datetime: str, cdx_sha1hex: Optional[str] = None
+ ) -> bytes:
+ """
+ Fetches an HTTP 200 record from wayback via the replay interface
+ (web.archive.org) instead of petabox.
+
+ Intended for use with SPN2 requests, where request body has not ended
+ up in petabox yet.
+
+ If cdx_sha1hex is passed, will try to verify fetched body. Note that
+ this check *won't work* in many cases, due to CDX hash being of
+ compressed transfer data, not the uncompressed final content bytes.
+
+ TODO: could instead try to verify that we got the expected replay body
+ using... new X-Archive headers?
+ """
+
+ # defensively check datetime format
+ assert len(datetime) == 14
+ assert datetime.isdigit()
try:
- raw_content = gwb_record.open_raw_content().read()
- except IncompleteRead as ire:
- raise WaybackError("failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
- return raw_content
+ resp = self.record_http_session.get(
+ self.wayback_endpoint + datetime + "id_/" + url,
+ allow_redirects=False,
+ headers=self.replay_headers,
+ )
+ except requests.exceptions.TooManyRedirects:
+ raise WaybackContentError("redirect loop (wayback replay fetch)")
+ except requests.exceptions.ConnectionError:
+ raise WaybackContentError("ConnectionError (wayback replay fetch)")
+ except requests.exceptions.ChunkedEncodingError:
+ raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
+ except UnicodeDecodeError:
+ raise WaybackContentError(
+ "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
+ url
+ )
+ )
- def fetch_url_datetime(self, url, datetime):
- cdx_row = self.cdx_client.lookup(url, datetime)
- return self.fetch_warc_content(
- cdx_row['warc_path'],
- cdx_row['warc_offset'],
- cdx_row['warc_csize'])
+ # defensively check that this is actually correct replay based on headers
+ if "X-Archive-Src" not in resp.headers:
+ # check if this was an error first
+ try:
+ resp.raise_for_status()
+ except Exception as e:
+ raise WaybackError(str(e))
+ # otherwise, a weird case (200/redirect but no Src header)
+ raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
+ if datetime not in resp.url:
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
+
+ if cdx_sha1hex:
+ # verify that body matches CDX hash
+ # TODO: don't need *all* these hashes, just sha1
+ file_meta = gen_file_metadata(resp.content)
+ if cdx_sha1hex != file_meta["sha1hex"]:
+ print(
+ " REPLAY MISMATCH: cdx:{} replay:{}".format(
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
+ file=sys.stderr,
+ )
+ raise WaybackContentError(
+ "replay fetch body didn't match CDX hash cdx:{} body:{}".format(
+ cdx_sha1hex, file_meta["sha1hex"]
+ ),
+ )
+ return resp.content
+
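    # For reference, the replay request built above has the form (the `id_` flag
    # asks wayback for the original captured bytes, without replay rewriting):
    #
    #   https://web.archive.org/web/20200101120000id_/https://example.org/paper.pdf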
+ def fetch_replay_redirect(self, url: str, datetime: str) -> Optional[str]:
+ """
+ Fetches an HTTP 3xx redirect Location from wayback via the replay interface
+ (web.archive.org) instead of petabox.
+
+ Intended for use with SPN2 requests, where request body has not ended
+ up in petabox yet. For example, re-ingesting a base_url which was
+ recently crawled by SPNv2, where we are doing ingest via the wayback path.
+
+ Returns None if a response is found but no redirect Location could be extracted.
+ """
+
+ # defensively check datetime format
+ assert len(datetime) == 14
+ assert datetime.isdigit()
+
+ try:
+ # when fetching via `id_`, it is possible to get a 5xx error which
+ # is either a wayback error, or an actual replay of an upstream 5xx
+ # error. the exception control flow here is tweaked, and a
+ # different HTTP session is used, to try and differentiate between
+ # the two cases
+ resp = None
+ resp = self.record_http_session.get(
+ self.wayback_endpoint + datetime + "id_/" + url,
+ allow_redirects=False,
+ headers=self.replay_headers,
+ )
+ resp.raise_for_status()
+ except requests.exceptions.TooManyRedirects:
+ raise WaybackContentError("redirect loop (wayback replay fetch)")
+ except UnicodeDecodeError:
+ raise WaybackContentError(
+ "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(
+ url
+ )
+ )
+ except Exception as e:
+ if resp is not None and "X-Archive-Src" in resp.headers:
+ raise WaybackContentError(
+ f"expected redirect record but got captured HTTP status: {resp.status_code}"
+ )
+ raise WaybackError(str(e))
+
+ # defensively check that this is actually correct replay based on headers
+ # previously checked for "X-Archive-Redirect-Reason" here
+ if (
+ "X-Archive-Src" not in resp.headers
+ and "X-Archive-Redirect-Reason" not in resp.headers
+ ):
+ raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
+ if datetime not in resp.url:
+ raise WaybackError(
+ "didn't get exact reply (redirect?) datetime:{} got:{}".format(
+ datetime, resp.url
+ )
+ )
+
+ redirect_url = resp.headers.get("Location")
+ # eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw
+ # print(redirect_url, file=sys.stderr)
+ if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
+ redirect_url = "/".join(redirect_url.split("/")[5:])
+ # print(redirect_url, file=sys.stderr)
+ if redirect_url and redirect_url.startswith("http"):
+ redirect_url = clean_url(redirect_url)
+ return redirect_url
+ else:
+ return None
+
+ def lookup_resource(
+ self,
+ start_url: str,
+ best_mimetype: Optional[str] = None,
+ closest: Union[str, datetime.datetime, None] = None,
+ ) -> ResourceResult:
+ """
+ Looks in wayback for a resource starting at the URL, following any
+ redirects. Returns a ResourceResult object, which may indicate a
+ failure to fetch the resource.
+
+ Only raises exceptions on remote service failure or unexpected
+ problems.
+
+ In a for loop:
+
+ lookup "best" CDX
+ redirect status code?
+ fetch wayback
+ continue
+ success (200)?
+ fetch wayback
+ return success
+ bad (other status)?
+ return failure
+
+ got to end?
+ return failure; too many redirects
+ """
+ next_url = start_url
+ urls_seen = [start_url]
+ for i in range(self.max_redirects + 1):
+ print(" URL: {}".format(next_url), file=sys.stderr)
+ next_row: Optional[CdxRow] = self.cdx_client.lookup_best(
+ next_url, best_mimetype=best_mimetype, closest=closest
+ )
+ # print(next_row, file=sys.stderr)
+ if not next_row:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="no-capture",
+ terminal_url=next_url,
+ terminal_dt=None,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+
+ cdx_row: CdxRow = next_row
+
+ # first try straight-forward redirect situation
+ if cdx_row.mimetype == "warc/revisit" and "/" in cdx_row.warc_path:
+ resource = self.fetch_petabox(
+ csize=cdx_row.warc_csize,
+ offset=cdx_row.warc_offset,
+ warc_path=cdx_row.warc_path,
+ )
+ if resource.revisit_cdx and resource.revisit_cdx.status_code in (200, 226):
+ return ResourceResult(
+ start_url=start_url,
+ hit=True,
+ status="success",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=resource.revisit_cdx.status_code,
+ body=resource.body,
+ cdx=cdx_row,
+ revisit_cdx=resource.revisit_cdx,
+ )
+ # else, continue processing with revisit record
+
+ if cdx_row.status_code in (200, 226):
+ revisit_cdx = None
+ final_cdx: Union[CdxRow, CdxPartial] = cdx_row
+ if "/" in cdx_row.warc_path:
+ resource = self.fetch_petabox(
+ csize=cdx_row.warc_csize,
+ offset=cdx_row.warc_offset,
+ warc_path=cdx_row.warc_path,
+ )
+ body = resource.body
+ revisit_cdx = resource.revisit_cdx
+ else:
+ body = self.fetch_replay_body(
+ url=cdx_row.url,
+ datetime=cdx_row.datetime,
+ )
+ final_cdx = cdx_partial_from_row(cdx_row)
+ return ResourceResult(
+ start_url=start_url,
+ hit=True,
+ status="success",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=final_cdx,
+ revisit_cdx=revisit_cdx,
+ )
+ elif 300 <= (cdx_row.status_code or 0) < 400:
+ if "/" in cdx_row.warc_path:
+ resource = self.fetch_petabox(
+ csize=cdx_row.warc_csize,
+ offset=cdx_row.warc_offset,
+ warc_path=cdx_row.warc_path,
+ resolve_revisit=False,
+ )
+ assert 300 <= resource.status_code < 400
+ if not resource.location:
+ print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="bad-redirect",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=None,
+ cdx=cdx_row,
+ revisit_cdx=None,
+ )
+ if "://" not in resource.location:
+ next_url = urllib.parse.urljoin(next_url, resource.location)
+ else:
+ next_url = resource.location
+ if next_url:
+ next_url = clean_url(next_url)
+ else:
+ redirect_url = self.fetch_replay_redirect(
+ url=cdx_row.url,
+ datetime=cdx_row.datetime,
+ )
+ if redirect_url:
+ redirect_url = clean_url(redirect_url)
+ if redirect_url:
+ next_url = redirect_url
+ else:
+ print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="bad-redirect",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=None,
+ cdx=cdx_row,
+ revisit_cdx=None,
+ )
+ if next_url in urls_seen:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="redirect-loop",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=None,
+ cdx=cdx_row,
+ revisit_cdx=None,
+ )
+ urls_seen.append(next_url)
+ continue
+ else:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="terminal-bad-status",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=None,
+ cdx=cdx_row,
+ revisit_cdx=None,
+ )
+
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="redirects-exceeded",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=None,
+ cdx=cdx_row,
+ revisit_cdx=None,
+ )
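# A minimal usage sketch for WaybackClient.lookup_resource(), assuming CDX and
# petabox credentials are configured via the environment (CDX_AUTH_TOKEN,
# PETABOX_WEBDATA_SECRET); the _example_* helper is illustrative only:
def _example_wayback_lookup() -> None:
    client = WaybackClient()
    result = client.lookup_resource(
        "https://example.org/paper.pdf", best_mimetype="application/pdf"
    )
    if result.hit:
        print(len(result.body or b""), result.terminal_url, file=sys.stderr)
    else:
        print(result.status, file=sys.stderr)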
class SavePageNowError(Exception):
pass
+
+class SavePageNowBackoffError(SandcrawlerBackoffError):
+ pass
+
+
+SavePageNowResult = namedtuple(
+ "SavePageNowResult",
+ [
+ "success",
+ "status",
+ "job_id",
+ "request_url",
+ "terminal_url",
+ "terminal_dt",
+ "resources",
+ ],
+)
+
+
class SavePageNowClient:
+ def __init__(self, v2endpoint: str = "https://web.archive.org/save", **kwargs):
+ self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY"))
+ self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY"))
+ self.v2endpoint = v2endpoint
+ self.v2_session = requests_retry_session(
+ retries=5, backoff_factor=3, status_forcelist=[502, 504]
+ )
+ self.v2_session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient",
+ "Accept": "application/json",
+ "Authorization": "LOW {}:{}".format(self.ia_access_key, self.ia_secret_key),
+ }
+ )
- def __init__(self, cdx_client=None, endpoint="https://web.archive.org/save/"):
- if cdx_client:
- self.cdx_client = cdx_client
+ # 3 minutes total
+ self.poll_count = 60
+ self.poll_seconds = 3.0
+
+ self.spn_cdx_retry_sec = kwargs.get("spn_cdx_retry_sec", 9.0)
+
+ # these are special-case web domains for which we want SPN2 to not run
+ # a headless browser (brozzler), but instead simply run wget.
+ # the motivation could be to work around browser issues, or in the
+ # future possibly to increase download efficiency (wget/fetch being
+ # faster than browser fetch)
+ self.simple_get_domains = [
+ # direct PDF links
+ "://arxiv.org/pdf/",
+ "://europepmc.org/backend/ptpmcrender.fcgi",
+ "://pdfs.semanticscholar.org/",
+ "://res.mdpi.com/",
+ # platform sites
+ "://zenodo.org/",
+ "://figshare.org/",
+ "://springernature.figshare.com/",
+ # popular simple cloud storage or direct links
+ "://s3-eu-west-1.amazonaws.com/",
+ ]
+
+ def save_url_now_v2(
+ self,
+ request_url: str,
+ force_simple_get: Optional[int] = None,
+ capture_outlinks: int = 0,
+ ) -> SavePageNowResult:
+ """
+ Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
+ at all, or raises an exception if there was an error with SPN itself.
+
+ If SPN2 was unable to fetch the remote content, `success` will be
+ false and status will be indicated.
+
+ SavePageNowResult fields:
+ - success: boolean; whether the SPN2 capture itself succeeded
+ - status: "success" or an error message/type
+ - job_id: returned by API
+ - request_url: url we asked to fetch
+ - terminal_url: final primary resource (after any redirects)
+ - terminal_dt: wayback timestamp of final capture
+ - resources: list of all URLs captured
+
+ TODO: parse SPN error codes (status string) and handle better. Eg,
+ non-200 remote statuses, invalid hosts/URLs, timeouts, backoff, etc.
+ """
+ if capture_outlinks:
+ print(" capturing outlinks!", file=sys.stderr)
+ if not (self.ia_access_key and self.ia_secret_key):
+ raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)")
+ if request_url.startswith("ftp://"):
+ return SavePageNowResult(
+ False,
+ "spn2-no-ftp",
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ if force_simple_get is None:
+ force_simple_get = 0
+ for domain in self.simple_get_domains:
+ if domain in request_url:
+ force_simple_get = 1
+ break
+
+ # check if SPNv2 user has capacity available
+ resp = self.v2_session.get(f"{self.v2endpoint}/status/user")
+ if resp.status_code == 429:
+ raise SavePageNowBackoffError(
+ f"SPNv2 availability API status_code: {resp.status_code}"
+ )
+ elif resp.status_code != 200:
+ raise SavePageNowError(f"SPN2 availability status_code: {resp.status_code}")
+ resp.raise_for_status()
+ status_user = resp.json()
+ if status_user["available"] <= 1:
+ print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr)
+ raise SavePageNowBackoffError(
+ "SPNv2 availability: {}, url: {}".format(status_user, request_url)
+ )
+
+ req_data = {
+ "url": request_url,
+ "capture_all": 1,
+ "if_not_archived_within": "1d",
+ "skip_first_archive": 1,
+ "js_behavior_timeout": 0,
+ # NOTE: not set explicitly to 0/false because of a bug in SPNv2 API
+ # implementation
+ # "capture_screenshot": 0,
+ # "outlinks_availability": 0,
+ }
+ if force_simple_get:
+ req_data["force_get"] = force_simple_get
+ if capture_outlinks:
+ req_data["capture_outlinks"] = capture_outlinks
+ try:
+ resp = self.v2_session.post(
+ self.v2endpoint,
+ data=req_data,
+ )
+ except requests.exceptions.ConnectionError:
+ raise SavePageNowError(f"SPN2 TCP connection error {request_url=}")
+
+ if resp.status_code == 429:
+ raise SavePageNowBackoffError(
+ "status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
+ elif resp.status_code != 200:
+ raise SavePageNowError(
+ "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)
+ )
+ resp.raise_for_status()
+ resp_json = resp.json()
+
+ if (
+ resp_json
+ and "message" in resp_json
+ and "You have already reached the limit of active sessions" in resp_json["message"]
+ ):
+ raise SavePageNowBackoffError(resp_json["message"])
+ elif (
+ resp_json
+ and "message" in resp_json
+ and "The same snapshot had been made" in resp_json["message"]
+ ):
+ return SavePageNowResult(
+ False,
+ "spn2-recent-capture",
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif resp_json.get("status") == "error":
+ return SavePageNowResult(
+ False,
+ resp_json.get("status_ext") or resp_json["status"],
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
+ elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]:
+ raise SavePageNowError(
+ "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)
+ )
+
+ job_id = resp_json["job_id"]
+ print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)
+ time.sleep(0.1)
+
+ # poll until complete
+ final_json = None
+ for i in range(self.poll_count):
+ resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, job_id))
+ try:
+ resp.raise_for_status()
+ except Exception:
+ raise SavePageNowError(resp.content)
+ status = resp.json()["status"]
+ if status == "pending":
+ time.sleep(self.poll_seconds)
+ elif status in ("success", "error"):
+ final_json = resp.json()
+ break
+ else:
+ raise SavePageNowError(
+ "Unknown SPN2 status:{} url:{}".format(status, request_url)
+ )
+
+ if not final_json:
+ raise SavePageNowError("SPN2 timed out (polling count exceeded)")
+
+ # if there was a recent crawl of same URL, fetch the status of that
+ # crawl to get correct datetime
+ if final_json.get("original_job_id"):
+ print(
+ f" SPN recent capture: {job_id} -> {final_json['original_job_id']}",
+ file=sys.stderr,
+ )
+ resp = self.v2_session.get(
+ "{}/status/{}".format(self.v2endpoint, final_json["original_job_id"])
+ )
+ try:
+ resp.raise_for_status()
+ except Exception:
+ raise SavePageNowError(resp.content)
+ final_json = resp.json()
+
+ # print(final_json, file=sys.stderr)
+
+ if final_json["status"] == "success":
+ if final_json.get("original_url").startswith("/"):
+ print(
+ f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}",
+ file=sys.stderr,
+ )
+ return SavePageNowResult(
+ True,
+ "success",
+ job_id,
+ request_url,
+ final_json["original_url"],
+ final_json["timestamp"],
+ final_json.get("resources") or None,
+ )
else:
- self.cdx_client = CdxApiClient()
- self.endpoint = endpoint
+ if final_json["status"] == "pending":
+ final_json["status"] = "error:pending"
+ return SavePageNowResult(
+ False,
+ final_json.get("status_ext") or final_json["status"],
+ job_id,
+ request_url,
+ None,
+ None,
+ None,
+ )
- def save_url_now(self, url):
+ def crawl_resource(
+ self,
+ start_url: str,
+ wayback_client: WaybackClient,
+ force_simple_get: Optional[int] = None,
+ ) -> ResourceResult:
"""
- Returns a tuple (cdx, blob) on success, or raises an error on non-success.
+ Runs a SPN2 crawl, then fetches body.
- XXX: handle redirects?
+ There is a delay between SPN2 crawls and WARC upload to petabox, so we
+ need to fetch the body via wayback replay instead of petabox
+ range-request.
"""
- resp = requests.get(self.endpoint + url)
- if resp.status_code != 200:
- raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
- body = resp.content
- cdx = self.cdx_client.lookup_latest(url)
- return (cdx, body)
+ # HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
+ if "gzbd.cnki.net/" in start_url:
+ spn_result = self.save_url_now_v2(
+ start_url, force_simple_get=force_simple_get, capture_outlinks=1
+ )
+ else:
+ spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get)
+
+ if not spn_result.success:
+ status = spn_result.status
+ if status in (
+ "error:invalid-url",
+ "error:not-found",
+ "error:invalid-host-resolution",
+ "error:gateway-timeout",
+ "error:too-many-redirects",
+ "error:read-timeout",
+ ):
+ status = status.replace("error:", "")
+ elif status in ("error:no-access", "error:forbidden"):
+ status = "forbidden"
+ elif status == "error:user-session-limit":
+ raise SavePageNowBackoffError("SPNv2 user-session-limit")
+ elif status == "error:internal-server-error":
+ status = "remote-server-error"
+ elif status.startswith("error:"):
+ status = "spn2-" + status
+ # despite other errors, call these a failure (so we don't retry)
+ if spn_result.terminal_url and (
+ spn_result.terminal_url.endswith("/cookieAbsent")
+ or spn_result.terminal_url.endswith("cookieSet=1")
+ ):
+ status = "blocked-cookie"
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status=status,
+ terminal_url=spn_result.terminal_url,
+ terminal_dt=spn_result.terminal_dt,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+ # print(spn_result, file=sys.stderr)
+
+ # detect partial URL response (aka, success, but missing full URL)
+ if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith("/"):
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="spn2-success-partial-url",
+ terminal_url=spn_result.terminal_url,
+ terminal_dt=spn_result.terminal_dt,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+
+ # don't try to CDX fetch for this common cookie block terminal
+ if spn_result.terminal_url.endswith(
+ "/cookieAbsent"
+ ) or spn_result.terminal_url.endswith("cookieSet=1"):
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="blocked-cookie",
+ terminal_url=spn_result.terminal_url,
+ terminal_dt=spn_result.terminal_dt,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+
+ cdx_row: Optional[CdxRow] = None
+ # hack to work around elsevier weirdness
+ if "://pdf.sciencedirectassets.com/" in spn_result.request_url:
+ elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best(
+ spn_result.request_url,
+ best_mimetype="application/pdf",
+ )
+ if elsevier_pdf_cdx and elsevier_pdf_cdx.mimetype == "application/pdf":
+ print(" Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ cdx_row = elsevier_pdf_cdx
+ else:
+ print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ # print(elsevier_pdf_cdx, file=sys.stderr)
+
+ if not cdx_row:
+ # lookup exact
+ try:
+ filter_status_code = None
+ if spn_result.terminal_url.startswith("ftp://"):
+ filter_status_code = 226
+ cdx_row = wayback_client.cdx_client.fetch(
+ url=spn_result.terminal_url,
+ datetime=spn_result.terminal_dt,
+ filter_status_code=filter_status_code,
+ retry_sleep=self.spn_cdx_retry_sec,
+ )
+ # sometimes there are fuzzy http/https self-redirects with the
+ # same SURT; try to work around that
+ if cdx_row.status_code >= 300 and cdx_row.status_code < 400:
+ cdx_row = wayback_client.cdx_client.fetch(
+ url=spn_result.terminal_url,
+ datetime=spn_result.terminal_dt,
+ filter_status_code=200,
+ retry_sleep=self.spn_cdx_retry_sec,
+ )
+ except KeyError as ke:
+ print(" CDX KeyError: {}".format(ke), file=sys.stderr)
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="spn2-cdx-lookup-failure",
+ terminal_url=spn_result.terminal_url,
+ terminal_dt=spn_result.terminal_dt,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+
+ # print(cdx_row, file=sys.stderr)
+
+ revisit_cdx = None
+ final_cdx: Union[CdxRow, CdxPartial] = cdx_row
+ if "/" in cdx_row.warc_path:
+ # Usually can't do this kind of direct fetch because CDX result is recent/live
+ resource = wayback_client.fetch_petabox(
+ csize=cdx_row.warc_csize,
+ offset=cdx_row.warc_offset,
+ warc_path=cdx_row.warc_path,
+ )
+ body = resource.body
+ if resource.revisit_cdx:
+ assert resource.revisit_cdx.sha1hex == cdx_row.sha1hex
+ revisit_cdx = resource.revisit_cdx
+ else:
+ # note: currently not trying to verify cdx_row.sha1hex
+ try:
+ body = wayback_client.fetch_replay_body(
+ url=cdx_row.url,
+ datetime=cdx_row.datetime,
+ )
+ except (WaybackError, WaybackContentError):
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="spn2-wayback-error",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+ # warc_path etc will change, so strip them out
+ final_cdx = cdx_partial_from_row(cdx_row)
+
+ assert cdx_row.status_code
+ if cdx_row.status_code in (200, 226):
+ return ResourceResult(
+ start_url=start_url,
+ hit=True,
+ status="success",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=final_cdx,
+ revisit_cdx=revisit_cdx,
+ )
+ else:
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="terminal-bad-status",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=body,
+ cdx=final_cdx,
+ revisit_cdx=revisit_cdx,
+ )
+
+
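# A minimal usage sketch combining SavePageNowClient and WaybackClient, assuming
# IA_ACCESS_KEY/IA_SECRET_KEY (and the wayback/CDX credentials above) are set in
# the environment; the _example_* helper is illustrative only:
def _example_spn_crawl() -> None:
    wayback_client = WaybackClient()
    spn_client = SavePageNowClient()
    result = spn_client.crawl_resource(
        "https://example.org/paper.pdf", wayback_client=wayback_client
    )
    print(result.status, result.terminal_url, result.terminal_dt, file=sys.stderr)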
+def fix_transfer_encoding(
+ file_meta: dict, resource: ResourceResult
+) -> Tuple[dict, ResourceResult]:
+ if (
+ resource.body
+ and file_meta["mimetype"] == "application/gzip"
+ and resource.cdx
+ and resource.cdx.mimetype != "application/gzip"
+ ):
+ print(
+ " transfer encoding not stripped: {}".format(resource.cdx.mimetype),
+ file=sys.stderr,
+ )
+ inner_body = gzip.decompress(resource.body)
+ if not inner_body:
+ raise Exception("null body inside transfer encoding")
+ inner_resource = ResourceResult(
+ body=inner_body,
+ # copy all other fields
+ start_url=resource.start_url,
+ hit=resource.hit,
+ status=resource.status,
+ terminal_url=resource.terminal_url,
+ terminal_dt=resource.terminal_dt,
+ terminal_status_code=resource.terminal_status_code,
+ cdx=resource.cdx,
+ revisit_cdx=resource.revisit_cdx,
+ )
+ inner_file_meta = gen_file_metadata(inner_resource.body)
+ return (inner_file_meta, inner_resource)
+ else:
+ return (file_meta, resource)
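# A minimal sketch of fix_transfer_encoding() behavior, assuming gen_file_metadata()
# detects the gzip wrapper: when a capture is gzip-compressed at the transfer level
# but the CDX row says the payload is something else (eg, a PDF), the body is
# decompressed and file_meta regenerated. The _example_* helper and all values are
# illustrative only:
def _example_fix_transfer_encoding() -> None:
    body = gzip.compress(b"%PDF-1.5 ...")
    resource = ResourceResult(
        start_url="https://example.org/paper.pdf",
        hit=True,
        status="success",
        terminal_url="https://example.org/paper.pdf",
        terminal_dt="20200101120000",
        terminal_status_code=200,
        body=body,
        cdx=CdxPartial(
            surt="org,example)/paper.pdf",
            datetime="20200101120000",
            url="https://example.org/paper.pdf",
            mimetype="application/pdf",
            status_code=200,
            sha1b32="EXAMPLE",
            sha1hex="EXAMPLE",
        ),
        revisit_cdx=None,
    )
    file_meta = gen_file_metadata(body)
    file_meta, resource = fix_transfer_encoding(file_meta, resource)
    print(file_meta["mimetype"], len(resource.body or b""), file=sys.stderr)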
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
new file mode 100644
index 0000000..03277f8
--- /dev/null
+++ b/python/sandcrawler/ingest_file.py
@@ -0,0 +1,925 @@
+import json
+import sys
+import time
+import xml.etree.ElementTree
+from http.server import BaseHTTPRequestHandler
+from typing import Any, Dict, List, Optional
+
+from selectolax.parser import HTMLParser
+
+from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.grobid import GrobidClient
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import (
+ html_extract_biblio,
+ html_extract_resources,
+ load_adblock_rules,
+)
+from sandcrawler.ia import (
+ CdxApiError,
+ NoCaptureError,
+ PetaboxError,
+ ResourceResult,
+ SavePageNowBackoffError,
+ SavePageNowClient,
+ SavePageNowError,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.ingest_html import (
+ WebResource,
+ fetch_html_resources,
+ html_extract_body_teixml,
+ html_guess_platform,
+ html_guess_scope,
+ quick_fetch_html_resources,
+)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
+from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.xml import xml_reserialize
+
+MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
+
+
+class IngestFileWorker(SandcrawlerWorker):
+ """
+ High level flow is to look in history first, then go to live web if
+ resource not found. Following redirects is treated as "fetching a
+ resource". Current version fetches a single resource; if it isn't a hit
+ but is an HTML 200, treats it as a landing page, tries to extract
+ fulltext link, then fetches that resource.
+
+ process(request, key=None) -> response
+ Does all the things!
+
+ Check existing processing (short circuit):
+
+ check_existing_ingest(base_url) -> ingest_file_result or none
+ process_existing(result) -> response
+ try fetching all the rows we want. if any don't exist, fetch the resource itself and call process_file_hit()
+
+ Fetch resource:
+
+ find_resource(url) -> ResourceResult
+
+ Process resource:
+
+ process_file_hit(ResourceResult) -> response
+ process_grobid(ResourceResult)
+ """
+
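    # A minimal usage sketch, assuming the default wayback/SPN/GROBID/postgrest
    # clients can be constructed from environment configuration:
    #
    #   worker = IngestFileWorker(try_spn2=False)
    #   result = worker.process(
    #       {"ingest_type": "pdf", "base_url": "https://example.org/paper.pdf"}
    #   )
    #   print(result["status"])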
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
+ super().__init__()
+
+ self.sink = sink
+
+ if kwargs.get("wayback_client"):
+ self.wayback_client: WaybackClient = kwargs["wayback_client"]
+ else:
+ self.wayback_client = WaybackClient()
+
+ if kwargs.get("spn_client"):
+ self.spn_client: SavePageNowClient = kwargs["spn_client"]
+ else:
+ self.spn_client = SavePageNowClient(
+ spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0)
+ )
+
+ if kwargs.get("grobid_client"):
+ self.grobid_client: GrobidClient = kwargs["grobid_client"]
+ else:
+ self.grobid_client = GrobidClient()
+
+ if kwargs.get("pgrest_client"):
+ self.pgrest_client: SandcrawlerPostgrestClient = kwargs["pgrest_client"]
+ else:
+ self.pgrest_client = SandcrawlerPostgrestClient()
+
+ self.grobid_sink = kwargs.get("grobid_sink")
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
+ self.pdftext_sink = kwargs.get("pdftext_sink")
+ self.xmldoc_sink = kwargs.get("xmldoc_sink")
+ self.htmlteixml_sink = kwargs.get("htmlteixml_sink")
+ self.max_hops = 8
+
+ self.try_existing_ingest = kwargs.get("try_existing_ingest", False)
+ self.try_existing_grobid = kwargs.get("try_existing_grobid", True)
+ self.try_existing_pdfextract = kwargs.get("try_existing_pdfextract", True)
+ self.try_wayback = kwargs.get("try_wayback", True)
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.html_quick_mode = kwargs.get("html_quick_mode", False)
+ self.adblock_rules = load_adblock_rules()
+ self.max_html_resources = 200
+
+ self.base_url_blocklist = [
+ "://localhost/",
+ "://127.0.0.1/",
+ # robot blocking / rate-limited
+ "://hkvalidate.perfdrive.com/",
+ "://ieeexplore.ieee.org/",
+ # temporary, until we implement specific fetch and 'petabox' output
+ "://archive.org/",
+ "://www.archive.org/",
+ "://web.archive.org/web/",
+ # out of scope
+ "://openlibrary.org/",
+ "://www.openlibrary.org/",
+ "://fatcat.wiki/",
+ "://scholar.archive.org/",
+ "://orcid.org/",
+ # Domain squats
+ "://bartandjones.com",
+ "://ijretm.com",
+ "://ijrcemas.com",
+ "://jist.net.in",
+ "://croisements-revue.org",
+ # all stubs/previews, not full papers
+ "://page-one.live.cf.public.springer.com",
+ # large datasets-only (no PDF expected)
+ "plutof.ut.ee/",
+ "www.gbif.org/",
+ "doi.pangaea.de/",
+ "www.plate-archive.org/",
+ "://doi.org/10.25642/ipk/gbis/",
+ "://apex.ipk-gatersleben.de/",
+ "fao.org/glis/",
+ # Historical non-paper content:
+ "dhz.uni-passau.de/", # newspapers
+ "digital.ucd.ie/", # ireland national historical
+ # DOI prefixes
+ "doi.org/10.2307/", # JSTOR; slow and many redirects
+ "doi.org/10.18730/", # fao.org: database entry
+ "doi.org/10.15468/", # gbif.org: database entry
+ "doi.org/10.48550/", # arxiv.org: redundant with direct ingest
+ # deprecated domain (doesn't redirect correctly)
+ "://edoc.mpg.de/",
+ # bogus/spam PDFs
+ "://isiarticles.com/",
+ ]
+
+ self.wall_blocklist = [
+ # loginwall
+ "://profile.thieme.de/HTML/sso/ejournals/login.htm",
+ "://login.bepress.com/",
+ "?SAMLRequest=",
+ "://osapublishing.org/captcha/",
+ "/password-login",
+ "://gateway.isiknowledge.com/",
+ "/login?TARGET=",
+ "jstage.jst.go.jp/sblogin",
+ "://acw.elsevier.com/SSOCore",
+ "://acw.sciencedirect.com/SSOCore",
+ "/login?source=",
+ ]
+
+ self.cookie_blocklist = [
+ "/cookieAbsent",
+ "cookieSet=1",
+ "error=cookies_not_supported",
+ # SPNv2 seems to work (not end up here), but heritrix fails
+ "://secure.jbs.elsevierhealth.com/",
+ ]
+
+ self.src_valid_mimetypes = [
+ "text/x-tex",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+ "application/zip",
+ "application/x-tar",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ]
+
+ self.component_valid_mimetypes = [
+ "image/jpeg",
+ "image/tiff",
+ "image/png",
+ "image/gif",
+ "audio/mpeg",
+ "video/mp4",
+ "video/mpeg",
+ "text/plain",
+ "text/csv",
+ "text/x-r-source", # dataverse
+ "text/tab-separated-values", # dataverse
+ "text/x-rst", # dataverse
+ "application/x-rlang-transport", # dataverse
+ "application/json",
+ "application/xml",
+ "application/pdf",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+ "application/zip ",
+ "application/x-rar ",
+ "application/x-7z-compressed",
+ "application/x-tar",
+ "application/vnd.ms-powerpoint",
+ "application/vnd.ms-excel",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ ]
+
+ def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+ """
+ Check in sandcrawler-db (postgres) to see if we have already ingested
+ this URL (ingest file result table).
+
+ Returns existing row *if* found *and* we should use it, otherwise None.
+
+ Looks at existing ingest results and makes a decision based on, eg,
+ status and timestamp.
+ """
+ if not self.try_existing_ingest:
+ return None
+ existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
+ # TODO: filter on more flags?
+ if existing and existing["hit"] is True:
+ return existing
+ else:
+ return None
+
+ def find_resource(
+ self, url: str, best_mimetype: Optional[str] = None, force_recrawl: bool = False
+ ) -> Optional[ResourceResult]:
+ """
+ Looks in wayback for a resource starting at the URL, following any
+ redirects. If a hit isn't found, try crawling with SPN.
+ """
+ via = "none"
+ resource = None
+
+ if url.startswith("http://web.archive.org/web/") or url.startswith(
+ "https://web.archive.org/web/"
+ ):
+ raise NotImplementedError("handling direct wayback links not supported yet")
+
+ if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
+ raise NotImplementedError("fetching from archive.org not implemented yet")
+
+ if self.try_wayback and not force_recrawl:
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(url, best_mimetype)
+
+ # check for "soft 404" conditions, where we should retry with live SPNv2
+ soft404 = False
+ # NOTE: these are often not working with SPNv2 either, so disabling. If
+ # we really want to try again, should do force-recrawl
+ # if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
+ # soft404 = True
+
+ old_failure = False
+ if (
+ resource
+ and not resource.hit
+ and resource.terminal_dt
+ and resource.terminal_dt < "20190000000000"
+ ):
+ old_failure = True
+
+ if self.try_spn2 and (
+ resource is None
+ or (resource and resource.status == "no-capture")
+ or soft404
+ or old_failure
+ ):
+ via = "spn2"
+ resource = self.spn_client.crawl_resource(url, self.wayback_client)
+ print(
+ "[FETCH {:>6}] {} {}".format(
+ via, (resource and resource.status), (resource and resource.terminal_url) or url
+ ),
+ file=sys.stderr,
+ )
+ return resource
+
+ def process_existing(self, request: dict, result_row: dict) -> dict:
+ """
+ If we have an existing ingest file result, do any database fetches or
+ additional processing necessary to return a result.
+ """
+ raise NotImplementedError("process_existing() not tested or safe yet")
+ assert result_row["hit"]
+ existing_file_meta = self.pgrest_client.get_file_meta(result_row["terminal_sha1hex"])
+ existing_grobid = self.pgrest_client.get_grobid(result_row["terminal_sha1hex"])
+ existing_cdx = self.pgrest_client.get_cdx(
+ result_row["terminal_url"], result_row["terminal_dt"]
+ )
+ if not (existing_file_meta and existing_grobid and existing_cdx):
+ raise NotImplementedError("partially-exsiting records not implemented yet")
+ result = {
+ "hit": result_row["hit"],
+ "status": "existing",
+ "request": request,
+ "grobid": existing_grobid,
+ "file_meta": existing_file_meta,
+ "cdx": existing_cdx,
+ "terminal": {
+ "terminal_url": result_row["terminal_url"],
+ "terminal_dt": result_row["terminal_dt"],
+ "terminal_status_code": result_row["terminal_status_code"],
+ "terminal_sha1hex": result_row["terminal_sha1hex"],
+ },
+ }
+ return result
+
+ def process_file_hit(
+ self, ingest_type: str, resource: ResourceResult, file_meta: dict
+ ) -> dict:
+ """
+ Run all the necessary processing for a new/fresh ingest hit.
+ """
+ if (
+ ingest_type in ["dataset-file", "component"]
+ and file_meta["mimetype"] == "application/pdf"
+ ):
+ ingest_type = "pdf"
+ if ingest_type == "pdf":
+ return {
+ "grobid": self.process_grobid(resource, file_meta),
+ "pdf_meta": self.process_pdfextract(resource, file_meta),
+ }
+ elif ingest_type == "xml":
+ return {
+ "xml_meta": self.process_xml(resource, file_meta),
+ }
+ elif ingest_type == "html":
+ html_info = self.process_html(resource, file_meta)
+ # if there is no html_biblio, don't clobber anything possibly extracted earlier
+ if "html_biblio" in html_info and not html_info["html_biblio"]:
+ html_info.pop("html_biblio")
+ return html_info
+ elif ingest_type == "src":
+ return {}
+ elif ingest_type == "component":
+ return {}
+ elif ingest_type == "dataset-file":
+ return {}
+ else:
+ raise NotImplementedError(f"process {ingest_type} hit")
+
+ def process_grobid(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+ Submits the resource body to GROBID for processing.
+
+ TODO: By default checks sandcrawler-db for an existing row first, then
+ decides whether we should re-process
+ """
+ if self.try_existing_grobid:
+ existing = self.pgrest_client.get_grobid(file_meta["sha1hex"])
+ if existing:
+ # grobid_timestamp = existing.get("grobid_timestamp") or None
+ # status
+ grobid_version = existing.get("grobid_version") or None
+ if grobid_version and grobid_version.startswith("0.7"):
+ print("found existing GROBID result", file=sys.stderr)
+ return existing
+
+ # Need to actually process
+ result = self.grobid_client.process_fulltext(resource.body)
+ if self.grobid_sink:
+ # extra fields for GROBID kafka messages
+ result["file_meta"] = file_meta
+ result["key"] = result["file_meta"]["sha1hex"]
+ self.grobid_sink.push_record(result.copy())
+ if result["status"] == "success":
+ metadata = self.grobid_client.metadata(result)
+ if metadata:
+ result["metadata"] = metadata
+ result["fatcat_release"] = metadata.pop("fatcat_release", None)
+ result["grobid_version"] = metadata.pop("grobid_version", None)
+ result.pop("tei_xml", None)
+ result.pop("file_meta", None)
+ result.pop("key", None)
+ return result
+
+ def process_pdfextract(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+ Extracts thumbnail and pdf_meta info from PDF.
+
+ By default checks sandcrawler-db for an existing row first, then decides
+ if we should re-process.
+
+ TODO: difference between Kafka schema and SQL/postgrest schema
+ """
+ if self.try_existing_pdfextract:
+ existing = self.pgrest_client.get_pdf_meta(file_meta["sha1hex"])
+ if existing:
+ print("found existing pdf_meta result", file=sys.stderr)
+ result = PdfExtractResult.from_pdf_meta_dict(existing)
+ return result.to_pdftext_dict()
+
+ # Need to actually process
+ result = process_pdf(resource.body)
+ assert result.sha1hex == file_meta["sha1hex"]
+ assert result.file_meta is not None
+ assert result.file_meta["sha1hex"] == file_meta["sha1hex"]
+ if self.thumbnail_sink and result.page0_thumbnail is not None:
+ self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
+ if self.pdftext_sink:
+ self.pdftext_sink.push_record(result.to_pdftext_dict(), key=result.sha1hex)
+ result.page0_thumbnail = None
+ result.text = None
+ result.file_meta = None
+ return result.to_pdftext_dict()
+
+ def process_xml(self, resource: ResourceResult, file_meta: dict) -> dict:
+ """
+ Simply publishes to Kafka topic.
+
+ In the future, could extract other metadata here (like body word
+ count), or attempt to fetch sub-resources.
+ """
+ if self.xmldoc_sink and file_meta["mimetype"] == "application/jats+xml":
+ try:
+ jats_xml = xml_reserialize(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="xml-parse-error")
+ msg = dict(
+ sha1hex=file_meta["sha1hex"],
+ status="success",
+ jats_xml=jats_xml,
+ )
+ self.xmldoc_sink.push_record(msg, key=file_meta["sha1hex"])
+ return dict(status="success")
+
+ def process_html(self, resource: ResourceResult, file_meta: dict) -> dict:
+
+ assert resource.body
+ try:
+ html_doc = HTMLParser(resource.body)
+ except ValueError:
+ return dict(status="html-selectolax-error")
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ assert html_biblio
+ try:
+ html_body = html_extract_body_teixml(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="html-teixml-error")
+ html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
+ html_scope = html_guess_scope(
+ resource.terminal_url, html_doc, html_biblio, html_body.get("word_count")
+ )
+ html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
+
+ if html_scope in ("blocked-captcha", "blocked-cookie", "blocked-forbidden"):
+ return dict(
+ status=html_scope,
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ )
+ elif html_scope not in (
+ "article-fulltext",
+ "unknown",
+ ):
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="wrong-scope",
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ raw_resources = html_extract_resources(
+ resource.terminal_url, html_doc, self.adblock_rules
+ )
+ if len(raw_resources) > self.max_html_resources:
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="too-many-resources",
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ if self.htmlteixml_sink and html_body["status"] == "success":
+ self.htmlteixml_sink.push_record(html_body, key=file_meta["sha1hex"])
+
+ html_body.pop("tei_xml", None)
+
+ partial_result = dict(
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_body=html_body,
+ )
+
+ when = parse_cdx_datetime(resource.cdx.datetime)
+ full_resources: List[WebResource] = []
+
+ try:
+ if self.html_quick_mode:
+ print(" WARN: running quick CDX-only fetches", file=sys.stderr)
+ full_resources = quick_fetch_html_resources(
+ raw_resources, self.wayback_client.cdx_client, when
+ )
+ else:
+ full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+ except PetaboxError as e:
+ partial_result["status"] = "petabox-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except CdxApiError as e:
+ partial_result["status"] = "cdx-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except WaybackError as e:
+ partial_result["status"] = "wayback-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except WaybackContentError as e:
+ partial_result["status"] = "wayback-content-error"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+ except NoCaptureError as e:
+ partial_result["status"] = "html-resource-no-capture"
+ partial_result["error_message"] = str(e)[:1600]
+ return partial_result
+
+ info = dict(
+ html_body=html_body,
+ html_biblio=html_biblio_dict,
+ scope=html_scope,
+ platform=html_platform,
+ html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
+ )
+ if html_scope == "unknown":
+ info["status"] = "unknown-scope"
+ return info
+
+ def timeout_response(self, task: dict) -> dict:
+ print("[TIMEOUT]", file=sys.stderr)
+ return dict(
+ request=task,
+ hit=False,
+ status="timeout",
+ error_message="ingest worker internal timeout",
+ )
+
+ def want(self, request: dict) -> bool:
+ if request.get("ingest_type") not in ("file", "pdf", "xml", "html", "src", "component"):
+ return False
+ return True
+
+ def process(self, request: dict, key: Any = None) -> dict:
+ return self.process_file(request, key=key)
+
+ def process_file(self, request: dict, key: Any = None) -> dict:
+
+ # backwards compatibility for the old "file" ingest type
+ if request.get("ingest_type") == "file":
+ request["ingest_type"] = "pdf"
+
+ ingest_type = request.get("ingest_type")
+ if ingest_type not in ("pdf", "xml", "html", "src", "component"):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request["base_url"])
+
+ force_recrawl = bool(request.get("force_recrawl", False))
+
+ for block in self.base_url_blocklist:
+ if block in base_url:
+ print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+ return dict(request=request, hit=False, status="skip-url-blocklist")
+
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+ best_mimetype = None
+ if ingest_type == "pdf":
+ best_mimetype = "application/pdf"
+ elif ingest_type == "xml":
+ best_mimetype = "text/xml"
+ elif ingest_type == "html":
+ best_mimetype = "text/html"
+ elif ingest_type == "src":
+ best_mimetype = "application/gzip"
+
+ existing = self.check_existing_ingest(ingest_type, base_url)
+ if existing:
+ return self.process_existing(request, existing)
+
+ result: Dict[str, Any] = dict(request=request, hit=False)
+
+ next_url = base_url
+ hops = [base_url]
+
+ while len(hops) <= self.max_hops:
+
+ result["hops"] = hops
+
+ # check against blocklist again on each hop
+ for block in self.base_url_blocklist:
+ if block in next_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ # also check against known loginwall patterns
+ for block in self.wall_blocklist:
+ if block in next_url:
+ # TODO: blocked-wall instead of skip-wall
+ result["status"] = "skip-wall"
+ return result
+
+ # check for popular cookie blocking URL patterns. On successful SPN
+ # crawls, shouldn't see these redirect URLs
+ for pattern in self.cookie_blocklist:
+ if pattern in next_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ try:
+ resource = self.find_resource(
+ next_url, best_mimetype, force_recrawl=force_recrawl
+ )
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except SavePageNowBackoffError as e:
+ result["status"] = "spn2-backoff"
+ result["error_message"] = str(e)[:1600]
+ # small sleep as a slow-down
+ time.sleep(2.0)
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ assert resource
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result["hops"]:
+ result["hops"].append(resource.terminal_url)
+
+ if not resource.hit:
+ result["status"] = resource.status
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ if not resource.body:
+ result["status"] = "empty-blob"
+ return result
+
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result["status"] = "body-too-large"
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result["status"] = "bad-gzip-encoding"
+ result["error_message"] = str(e)
+ return result
+
+ if not resource.body or file_meta["size_bytes"] == 0:
+ result["status"] = "empty-blob"
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta["mimetype"]
+ or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml"
+ or "application/xml" in file_meta["mimetype"]
+ or "text/xml" in file_meta["mimetype"]
+ )
+ html_biblio = None
+ html_doc = None
+ if html_ish_resource and resource.body:
+ try:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio:
+ if "html_biblio" not in result and html_biblio.title:
+ result["html_biblio"] = json.loads(
+ html_biblio.json(exclude_none=True)
+ )
+ # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+ except ValueError:
+ pass
+
+ if ingest_type == "pdf" and html_ish_resource:
+
+ # the new style of URL extraction (already computed)
+ if html_biblio and html_biblio.pdf_fulltext_url:
+ fulltext_url = dict(
+ pdf_url=html_biblio.pdf_fulltext_url,
+ technique="html_biblio",
+ )
+ else:
+ fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
+
+ result["extract_next_hop"] = fulltext_url
+ if not fulltext_url:
+ # check if we hit a paywall/loginwall
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+ # else, just failed to find link
+ result["status"] = "no-pdf-link"
+ return result
+ next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or ""
+ assert next_url
+ next_url = clean_url(next_url)
+ print(
+ "[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ fulltext_url.get("technique"),
+ next_url,
+ ),
+ file=sys.stderr,
+ )
+ if next_url in hops:
+ result["status"] = "link-loop"
+ result["error_message"] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+ elif (
+ ingest_type in ("xml", "html", "component")
+ and html_ish_resource
+ and html_biblio
+ ):
+ # NOTE: src_fulltext_url is not a thing
+ next_url_found = None
+ if ingest_type == "xml" and html_biblio.xml_fulltext_url:
+ next_url_found = html_biblio.xml_fulltext_url
+ elif ingest_type == "html" and html_biblio.html_fulltext_url:
+ next_url_found = html_biblio.html_fulltext_url
+ elif ingest_type == "component" and html_biblio.component_url:
+ next_url_found = html_biblio.component_url
+
+ if next_url_found:
+ next_url = next_url_found
+ technique = "html_biblio"
+ print(
+ "[PARSE {:>6}] {} {}".format(
+ ingest_type,
+ technique,
+ next_url,
+ ),
+ file=sys.stderr,
+ )
+ if next_url in hops:
+ if ingest_type == "html":
+ # for HTML ingest, we don't count this as a link-loop
+ break
+ result["status"] = "link-loop"
+ result["error_message"] = "repeated: {}".format(next_url)
+ return result
+ hops.append(next_url)
+ continue
+
+ # default is to NOT keep hopping
+ break
+
+ if len(hops) >= self.max_hops:
+ result["status"] = "max-hops-exceeded"
+ return result
+
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit is True
+ assert resource.terminal_status_code in (200, 226)
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta["sha1hex"],
+ }
+
+ result["file_meta"] = file_meta
+ result["cdx"] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+
+ # check if we hit a paywall/loginwall before trying mimetype
+ for block in self.wall_blocklist:
+ if block in resource.terminal_url:
+ result["status"] = "blocked-wall"
+ return result
+
+ if ingest_type == "pdf":
+ if file_meta["mimetype"] != "application/pdf":
+ result["status"] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta["mimetype"] not in (
+ "application/xml",
+ "text/xml",
+ "application/jats+xml",
+ ):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "src":
+ if file_meta["mimetype"] not in self.src_valid_mimetypes:
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "component":
+ if file_meta["mimetype"] not in self.component_valid_mimetypes:
+ result["status"] = "wrong-mimetype"
+ return result
+ else:
+ raise NotImplementedError()
+
+ info = self.process_file_hit(ingest_type, resource, file_meta)
+ result.update(info)
+
+ # check if processing turned up an error
+ if info.get("status") not in ("success", None):
+ result["status"] = info["status"]
+ return result
+
+ result["status"] = "success"
+ result["hit"] = True
+ if ingest_type == "pdf":
+ print(
+ "[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format(
+ ingest_type,
+ result.get("file_meta", {}).get("sha1hex"),
+ result.get("grobid", {}).get("status_code"),
+ result.get("pdf_meta", {}).get("status"),
+ ),
+ file=sys.stderr,
+ )
+ else:
+ print(
+ "[SUCCESS {:>5}] sha1:{}".format(
+ ingest_type,
+ result.get("file_meta", {}).get("sha1hex"),
+ ),
+ file=sys.stderr,
+ )
+ return result
+
+
+class IngestFileRequestHandler(BaseHTTPRequestHandler):
+ def do_POST(self) -> None:
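+ # Expects a JSON-encoded ingest request in the POST body; an illustrative
+ # (not authoritative) example:
+ #   {"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf"}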
+ if self.path != "/ingest":
+ self.send_response(404)
+ self.end_headers()
+ self.wfile.write(b"404: Not Found")
+ return
+ length = int(self.headers.get("content-length"))
+ request = json.loads(self.rfile.read(length).decode("utf-8"))
+ print("Got request: {}".format(request))
+ ingester = IngestFileWorker()
+ result = ingester.process(request)
+ self.send_response(200)
+ self.end_headers()
+ self.wfile.write(json.dumps(result).encode("utf8"))
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
new file mode 100644
index 0000000..3acbece
--- /dev/null
+++ b/python/sandcrawler/ingest_fileset.py
@@ -0,0 +1,516 @@
+import json
+import sys
+import time
+from typing import Any, Dict, Optional
+
+import requests
+from selectolax.parser import HTMLParser
+
+from sandcrawler.fileset_platforms import (
+ ArchiveOrgHelper,
+ DataverseHelper,
+ FigshareHelper,
+ ZenodoHelper,
+)
+from sandcrawler.fileset_strategies import (
+ ArchiveorgFilesetStrategy,
+ ArchiveorgFileStrategy,
+ WebFilesetStrategy,
+ WebFileStrategy,
+)
+from sandcrawler.fileset_types import (
+ IngestStrategy,
+ PlatformRestrictedError,
+ PlatformScopeError,
+)
+from sandcrawler.html_metadata import html_extract_biblio
+from sandcrawler.ia import (
+ CdxApiError,
+ PetaboxError,
+ SavePageNowError,
+ WaybackContentError,
+ WaybackError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.ingest_file import IngestFileWorker
+from sandcrawler.misc import clean_url, gen_file_metadata
+from sandcrawler.workers import SandcrawlerWorker
+
+MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
+
+
+class IngestFilesetWorker(IngestFileWorker):
+ """
+ General process is:
+
+ 1. crawl base_url, and use request and landing page resource (eg, HTML) to
+ determine platform being targeted
+ 2. use platform-specific helper to fetch metadata about the work, including
+ a manifest of files, and selection of an "ingest strategy" and any
+ required context
+ 3. then use strategy-specific helper to archive files from manifest (first
+ checking to see if content has been archived already)
+ 4. summarize status
+ """
+
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
+ super().__init__(sink=None, **kwargs)
+
+ self.try_spn2 = kwargs.get("try_spn2", True)
+ self.sink = sink
+ self.dataset_platform_helpers = {
+ "dataverse": DataverseHelper(),
+ "figshare": FigshareHelper(),
+ "zenodo": ZenodoHelper(),
+ "archiveorg": ArchiveOrgHelper(),
+ }
+ self.dataset_strategy_archivers = {
+ IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
+ IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
+ IngestStrategy.WebFileset: WebFilesetStrategy(try_spn2=self.try_spn2),
+ IngestStrategy.WebFile: WebFileStrategy(try_spn2=self.try_spn2),
+ }
+
+ self.max_total_size = kwargs.get("max_total_size", 64 * 1024 * 1024 * 1024)
+ self.max_file_count = kwargs.get("max_file_count", 200)
+ self.ingest_file_result_sink = kwargs.get("ingest_file_result_sink")
+ self.ingest_file_result_stdout = kwargs.get("ingest_file_result_stdout", False)
+
+ def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+ """
+ Same as the file version, but uses the fileset ingest result table.
+ """
+ if not self.try_existing_ingest:
+ return None
+ existing = self.pgrest_client.get_ingest_fileset_platform(ingest_type, base_url)
+ # TODO: filter on more flags?
+ if existing and existing["hit"] is True:
+ return existing
+ else:
+ return None
+
+ def process_existing(self, request: dict, result_row: dict) -> dict:
+ """
+ If we have an existing ingest fileset result, do any database fetches
+ or additional processing necessary to return a result.
+ """
+ raise NotImplementedError("process_existing() not tested or safe yet")
+
+ def want(self, request: dict) -> bool:
+ if request.get("ingest_type") not in ("dataset",):
+ return False
+ return True
+
+ def fetch_resource_iteratively(
+ self, ingest_type: str, base_url: str, force_recrawl: bool
+ ) -> dict:
+ """
+ This is copied from process_file() and should probably be refactored.
+ """
+
+ result: Dict[str, Any] = dict(hit=False)
+ result["hops"] = [base_url]
+ next_url = base_url
+
+ # check against blocklist
+ for block in self.base_url_blocklist:
+ # NOTE: hack to not skip archive.org content
+ if "archive.org" in block:
+ continue
+ if block in next_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ try:
+ resource = self.find_resource(next_url, force_recrawl=force_recrawl)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ html_biblio = None
+ if resource:
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result["hops"]:
+ result["hops"].append(resource.terminal_url)
+
+ if not resource.hit:
+ result["status"] = resource.status
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "skip-url-blocklist"
+ return result
+
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result["status"] = "blocked-cookie"
+ return result
+
+ if not resource.body:
+ result["status"] = "empty-blob"
+ return result
+
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result["status"] = "body-too-large"
+ return result
+
+ file_meta = gen_file_metadata(resource.body)
+ try:
+ file_meta, resource = fix_transfer_encoding(file_meta, resource)
+ except Exception as e:
+ result["status"] = "bad-gzip-encoding"
+ result["error_message"] = str(e)
+ return result
+
+ if not resource.body or file_meta["size_bytes"] == 0:
+ result["status"] = "empty-blob"
+ return result
+
+ # here we split based on ingest type to try and extract a next hop
+ html_ish_resource = bool(
+ "html" in file_meta["mimetype"]
+ or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml"
+ or "application/xml" in file_meta["mimetype"]
+ or "text/xml" in file_meta["mimetype"]
+ )
+ html_biblio = None
+ html_doc = None
+ if html_ish_resource and resource.body:
+ try:
+ html_doc = HTMLParser(resource.body)
+ html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ if html_biblio:
+ if "html_biblio" not in result and html_biblio.title:
+ result["html_biblio"] = json.loads(
+ html_biblio.json(exclude_none=True)
+ )
+ # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+ except ValueError:
+ pass
+
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
+ assert resource
+ assert resource.hit is True
+ assert resource.terminal_status_code in (200, 226)
+
+ if resource.terminal_url:
+ result["terminal"] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ "terminal_sha1hex": file_meta["sha1hex"],
+ }
+
+ result["file_meta"] = file_meta
+ result["cdx"] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)
+
+ if ingest_type == "pdf":
+ if file_meta["mimetype"] != "application/pdf":
+ result["status"] = "wrong-mimetype" # formerly: "other-mimetype"
+ return result
+ elif ingest_type == "xml":
+ if file_meta["mimetype"] not in (
+ "application/xml",
+ "text/xml",
+ "application/jats+xml",
+ ):
+ result["status"] = "wrong-mimetype"
+ return result
+ elif ingest_type == "html":
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ result["status"] = "wrong-mimetype"
+ return result
+ else:
+ # eg, datasets, components, etc
+ pass
+
+ result["_html_biblio"] = html_biblio
+ result["_resource"] = resource
+ return result
+
+ def process(self, request: dict, key: Any = None) -> dict:
+
+ ingest_type = request.get("ingest_type")
+ if ingest_type not in ("dataset",):
+ raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request["base_url"])
+
+ force_recrawl = bool(request.get("force_recrawl", False))
+
+ print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+ # TODO: "existing" check against file and/or fileset ingest result table
+ # existing = self.check_existing_ingest(ingest_type, base_url)
+ # if existing:
+ # return self.process_existing(request, existing)
+
+ result = self.fetch_resource_iteratively(
+ ingest_type, base_url, force_recrawl=force_recrawl
+ )
+ result["request"] = request
+ if result.get("status") is not None:
+ result["request"] = request
+ return result
+
+ html_biblio = result.pop("_html_biblio")
+ resource = result.pop("_resource")
+
+ # 1. Determine `platform`, which may involve resolving redirects and crawling a landing page.
+
+ # TODO: could involve html_guess_platform() here?
+
+ # determine platform
+ platform_helper = None
+ for (helper_name, helper) in self.dataset_platform_helpers.items():
+ if helper.match_request(request, resource, html_biblio):
+ platform_helper = helper
+ break
+
+ if not platform_helper:
+ result["status"] = "no-platform-match"
+ return result
+
+ # 2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`.
+ try:
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ except PlatformScopeError as e:
+ result["status"] = "platform-scope"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PlatformRestrictedError as e:
+ result["status"] = "platform-restricted"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except NotImplementedError as e:
+ result["status"] = "not-implemented"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except requests.exceptions.HTTPError as e:
+ result["error_message"] = str(e)[:1600]
+ if e.response.status_code == 404:
+ result["status"] = "platform-404"
+ result["error_message"] = str(e)[:1600]
+ return result
+ else:
+ result["status"] = "platform-http-error"
+ return result
+ except requests.exceptions.RequestException as e:
+ result["error_message"] = str(e)[:1600]
+ result["status"] = "platform-error"
+ return result
+
+ # print(dataset_meta, file=sys.stderr)
+ platform = dataset_meta.platform_name
+ result["platform_name"] = dataset_meta.platform_name
+ result["platform_domain"] = dataset_meta.platform_domain
+ result["platform_id"] = dataset_meta.platform_id
+ result["platform_base_url"] = dataset_meta.web_base_url
+ result["archiveorg_item_name"] = dataset_meta.archiveorg_item_name
+
+ if not dataset_meta.manifest:
+ result["status"] = "empty-manifest"
+ return result
+
+ # these will get confirmed/updated after ingest
+ result["manifest"] = [m.dict(exclude_none=True) for m in dataset_meta.manifest]
+ result["file_count"] = len(dataset_meta.manifest)
+ result["total_size"] = sum([m.size for m in dataset_meta.manifest if m.size])
+
+ if result["total_size"] > self.max_total_size:
+ result["status"] = "too-large-size"
+ return result
+ if result["file_count"] > self.max_file_count:
+ # hard max, to prevent downstream breakage
+ if result["file_count"] > 10 * 1000:
+ result["manifest"] = result["manifest"][: self.max_file_count]
+ result["status"] = "too-many-files"
+ return result
+
+ ingest_strategy = platform_helper.chose_strategy(dataset_meta)
+ result["ingest_strategy"] = ingest_strategy
+ print(
+ f"[PLATFORM {platform}] id={dataset_meta.platform_id} file_count={result['file_count']} total_size={result['total_size']} strategy={ingest_strategy}",
+ file=sys.stderr,
+ )
+
+ strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy)
+ if not strategy_helper:
+ result["status"] = "no-strategy-helper"
+ return result
+
+ # 3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata.
+ try:
+ archive_result = strategy_helper.process(dataset_meta)
+ except SavePageNowError as e:
+ result["status"] = "spn2-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except PetaboxError as e:
+ result["status"] = "petabox-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except CdxApiError as e:
+ result["status"] = "cdx-error"
+ result["error_message"] = str(e)[:1600]
+ # add a sleep in cdx-error path as a slow-down
+ time.sleep(2.0)
+ return result
+ except WaybackError as e:
+ result["status"] = "wayback-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+ except WaybackContentError as e:
+ result["status"] = "wayback-content-error"
+ result["error_message"] = str(e)[:1600]
+ return result
+
+ # 4. Summarize status and return structured result metadata.
+ result["status"] = archive_result.status
+ result["manifest"] = [m.dict(exclude_none=True) for m in archive_result.manifest]
+
+ if ingest_strategy.endswith("-fileset-bundle"):
+ result["fileset_bundle"] = dict()
+ if archive_result.bundle_file_meta:
+ result["fileset_bundle"]["file_meta"] = archive_result.bundle_file_meta
+ if archive_result.bundle_archiveorg_path:
+ result["fileset_bundle"][
+ "archiveorg_bundle_path"
+ ] = archive_result.bundle_archiveorg_path
+ if archive_result.bundle_resource:
+ result["fileset_bundle"]["terminal"] = dict(
+ terminal_url=archive_result.bundle_resource.terminal_url,
+ terminal_dt=archive_result.bundle_resource.terminal_dt,
+ terminal_status_code=archive_result.bundle_resource.terminal_status_code,
+ )
+ if archive_result.bundle_resource.cdx:
+ result["fileset_bundle"]["cdx"] = cdx_to_dict(
+ archive_result.bundle_resource.cdx
+ )
+ if archive_result.bundle_resource.revisit_cdx:
+ result["fileset_bundle"]["revisit_cdx"] = cdx_to_dict(
+ archive_result.bundle_resource.revisit_cdx
+ )
+
+ if ingest_strategy.endswith("-file"):
+ result["fileset_file"] = dict()
+ if archive_result.file_file_meta:
+ result["fileset_file"]["file_meta"] = (archive_result.file_file_meta,)
+ if archive_result.file_resource:
+ result["fileset_file"]["terminal"] = dict(
+ terminal_url=archive_result.file_resource.terminal_url,
+ terminal_dt=archive_result.file_resource.terminal_dt,
+ terminal_status_code=archive_result.file_resource.terminal_status_code,
+ )
+ if archive_result.file_resource.cdx:
+ result["fileset_file"]["cdx"] = cdx_to_dict(
+ archive_result.file_resource.cdx
+ )
+ if archive_result.file_resource.revisit_cdx:
+ result["fileset_file"]["revisit_cdx"] = cdx_to_dict(
+ archive_result.file_resource.revisit_cdx
+ )
+
+ if result["status"].startswith("success"):
+ # check that these are still valid
+ assert result["file_count"] == len(archive_result.manifest)
+ assert result["total_size"] == sum(
+ [m.size for m in archive_result.manifest if m.size]
+ )
+
+ if (
+ result["status"] == "success-file"
+ and archive_result.file_resource
+ and archive_result.file_file_meta
+ ):
+ file_result: Dict[str, Any] = dict(
+ hit=True,
+ status="success",
+ request=request.copy(),
+ file_meta=archive_result.file_file_meta,
+ terminal=dict(
+ terminal_url=archive_result.file_resource.terminal_url,
+ terminal_dt=archive_result.file_resource.terminal_dt,
+ terminal_status_code=archive_result.file_resource.terminal_status_code,
+ terminal_sha1hex=archive_result.file_file_meta["sha1hex"],
+ ),
+ )
+ if archive_result.file_resource.cdx:
+ file_result["cdx"] = cdx_to_dict(archive_result.file_resource.cdx)
+ if archive_result.file_resource.revisit_cdx:
+ file_result["revisit_cdx"] = cdx_to_dict(
+ archive_result.file_resource.revisit_cdx
+ )
+ file_result["request"]["ingest_type"] = request["ingest_type"] + "-file"
+ # call the (inherited) ingest_file version of process_file_hit()
+ info = self.process_file_hit(
+ file_result["request"]["ingest_type"],
+ archive_result.file_resource,
+ archive_result.file_file_meta,
+ )
+ file_result.update(info)
+ if self.ingest_file_result_sink:
+ self.ingest_file_result_sink.push_record(file_result.copy())
+ elif self.ingest_file_result_stdout:
+ sys.stdout.write(json.dumps(file_result, sort_keys=True) + "\n")
+
+ if result["status"].startswith("success"):
+ result["hit"] = True
+ print(
+ "[SUCCESS {:>5}] file_count={} total_size={} strategy={}".format(
+ ingest_type,
+ result["file_count"],
+ result["total_size"],
+ ingest_strategy,
+ ),
+ file=sys.stderr,
+ )
+ else:
+ print(
+ "[FAIL {:>5}] status={} file_count={} total_size={} strategy={}".format(
+ ingest_type,
+ result["status"],
+ result["file_count"],
+ result["total_size"],
+ ingest_strategy,
+ ),
+ file=sys.stderr,
+ )
+ return result
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
new file mode 100644
index 0000000..fb42e71
--- /dev/null
+++ b/python/sandcrawler/ingest_html.py
@@ -0,0 +1,499 @@
+import argparse
+import datetime
+import json
+import sys
+import xml.etree.ElementTree as ET
+from typing import Any, List, Optional, Tuple
+
+import pydantic
+import trafilatura
+from selectolax.parser import HTMLParser
+
+from sandcrawler.html_metadata import (
+ BiblioMetadata,
+ html_extract_biblio,
+ html_extract_resources,
+ load_adblock_rules,
+)
+from sandcrawler.ia import (
+ CdxApiClient,
+ NoCaptureError,
+ WaybackClient,
+ WaybackContentError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.misc import (
+ datetime_to_cdx,
+ gen_file_metadata,
+ parse_cdx_datetime,
+ url_fuzzy_equal,
+)
+
+TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
+
+
+def html_extract_body_teixml(doc: bytes) -> dict:
+ try:
+ tei_xml = trafilatura.extract(
+ doc,
+ output_format="xmltei",
+ include_comments=False,
+ include_formatting=True,
+ )
+ except Exception as e:  # catch-all; trafilatura can raise ValueError, TypeError, etc
+ return dict(
+ status="trafilatura-parse-error",
+ error_msg=str(e)[:1000],
+ )
+ if tei_xml:
+ body_txt = teixml_body_text(tei_xml)
+ word_count = len(body_txt.split())
+ return dict(
+ status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count
+ )
+ elif doc.startswith(
+ b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'
+ ):
+ # hack for firstmonday.org
+ return html_extract_body_teixml(doc[106:])
+ else:
+ return dict(status="empty-xml", agent=TRAFILATURA_AGENT)
+
+
+def teixml_body_text(doc_xml: str) -> str:
+ ns = {"tei": "http://www.tei-c.org/ns/1.0"}
+ tree = ET.fromstring(doc_xml)
+ body = tree.find(".//tei:body", ns)
+ if body is not None:
+ return " ".join(body.itertext())
+ else:
+ return ""
+
+
+class WebResource(pydantic.BaseModel):
+ surt: str
+ timestamp: datetime.datetime
+ url: str
+ sha1hex: str
+ mimetype: str
+ status_code: int
+ size: Optional[int]
+ sha256hex: Optional[str]
+ resource_type: Optional[str]
+
+ class Config:
+ json_encoders = {datetime.datetime: lambda dt: dt.isoformat()}
+
+
+class IngestWebResult(pydantic.BaseModel):
+ status: str
+ hit: bool
+ error_message: Optional[str]
+ cdx: Optional[dict]
+ terminal: Optional[Any] # TODO
+ request: Optional[Any] # TODO
+ file_meta: Optional[dict]
+ html_biblio: Optional[BiblioMetadata]
+ scope: Optional[str]
+ html_body: Optional[dict]
+ html_resources: Optional[List[WebResource]]
+
+ class Config:
+ arbitrary_types_allowed = True
+ json_encoders = {
+ datetime.datetime: lambda dt: dt.isoformat(),
+ }
+
+
+class HtmlMetaRow(pydantic.BaseModel):
+ sha1hex: str
+ status: str
+ scope: Optional[str]
+ has_teixml: bool
+ has_thumbnail: bool
+ word_count: Optional[int]
+ biblio: Optional[dict]
+ resources: Optional[List[dict]]
+
+ class Config:
+ arbitrary_types_allowed = True
+ json_encoders = {
+ datetime.datetime: lambda dt: dt.isoformat(),
+ }
+
+ def to_sql_tuple(self) -> Tuple:
+ """
+ This is for the html_meta SQL table.
+ """
+ return (
+ self.sha1hex,
+ datetime.datetime.now(), # updated
+ self.status,
+ self.scope,
+ self.has_teixml,
+ self.has_thumbnail,
+ self.word_count,
+ (self.biblio or None) and json.dumps(self.biblio, sort_keys=True),
+ (self.resources or None) and json.dumps(self.resources, sort_keys=True),
+ )
+
+
+def quick_fetch_html_resources(
+ resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
+ """
+ This is the lazy version that just does a CDX lookup for each resource.
+
+ Takes a list instead of a single record because we may want to circuit-break
+ on failure, and may introduce concurrency internal to this function.
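+
+ Each resource dict is expected to carry at least "url" and "type" keys, eg
+ (illustrative): {"url": "https://example.com/figure1.png", "type": "image"}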
+ """
+
+ full = []
+ closest = when and datetime_to_cdx(when)
+ for resource in resources:
+ cdx_row = cdx_client.lookup_best(resource["url"], closest=closest)
+ if not cdx_row:
+ raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
+ if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]):
+ print(
+ f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr
+ )
+ if not cdx_row.status_code:
+ # TODO: fall back to a full fetch?
+ print(" WARN: skipping revisit record", file=sys.stderr)
+ continue
+ full.append(
+ WebResource(
+ surt=cdx_row.surt,
+ timestamp=cdx_row.datetime,
+ url=cdx_row.url,
+ sha1hex=cdx_row.sha1hex,
+ mimetype=cdx_row.mimetype,
+ status_code=cdx_row.status_code,
+ size=None,
+ sha256hex=None,
+ resource_type=resource["type"],
+ )
+ )
+
+ return full
+
+
+def fetch_html_resources(
+ resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
+ """
+ This is the full version which fetches each resource from wayback/petabox
+ and calculates additional hashes.
+
+ Could make this concurrent in the future, eg: https://realpython.com/python-concurrency/#threading-version
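+
+ A minimal concurrency sketch, assuming wayback_client.lookup_resource() is
+ thread-safe (not verified here):
+
+ from concurrent.futures import ThreadPoolExecutor
+ with ThreadPoolExecutor(max_workers=4) as pool:
+ responses = list(pool.map(lambda r: wayback_client.lookup_resource(r["url"]), resources))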
+ """
+
+ full = []
+ closest = when and datetime_to_cdx(when)
+ for resource in resources:
+ wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest)
+ if not wayback_resp or wayback_resp.status != "success":
+ raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
+ # for HTML sub-resources specifically, we allow the CDX SHA1 to match
+ # either the transfer-encoded or the inner (un-encoded) payload body,
+ # because of an ambiguity in the WARC specification
+ outer_file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
+ try:
+ file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp)
+ except Exception as e:
+ raise WaybackContentError(f"bad gzip encoding: {e}")
+ if (
+ file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ and outer_file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ ):
+ raise WaybackContentError(
+ f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url} found:{file_meta['sha1hex']} expected:{wayback_resp.cdx.sha1hex}"
+ )
+ full.append(
+ WebResource(
+ surt=wayback_resp.cdx.surt,
+ timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
+ url=wayback_resp.cdx.url,
+ sha1hex=file_meta["sha1hex"],
+ mimetype=file_meta["mimetype"],
+ status_code=wayback_resp.cdx.status_code
+ or wayback_resp.revisit_cdx.status_code,
+ size=file_meta["size_bytes"],
+ sha256hex=file_meta["sha256hex"],
+ resource_type=resource["type"],
+ )
+ )
+
+ return full
+
+
+def html_guess_platform(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
+) -> Optional[str]:
+
+ generator: Optional[str] = None
+ generator_elem = doc.css_first("meta[name='generator']")
+ if generator_elem:
+ generator = generator_elem.attrs["content"]
+ else:
+ generator_elem = doc.css_first("a[id='developedBy']")
+ if generator_elem:
+ generator = generator_elem.text()
+ if generator and "open journal systems 3" in generator.lower():
+ return "ojs3"
+ elif generator and "open journal systems" in generator.lower():
+ return "ojs"
+ elif generator and "plone" in generator.lower():
+ return "plone"
+ elif generator and "wordpress" in generator.lower():
+ return "wordpress"
+ elif generator and "blogger" in generator.lower():
+ return "blogger"
+ elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
+ return "ojs"
+ else:
+ try:
+ if (
+ 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>'
+ in doc.html
+ ):
+ return "ojs"
+ if '<a href="https://www.pubpub.org">Published with' in doc.html:
+ return "pubpub"
+ if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
+ return "arpha"
+ if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
+ return "galenos"
+ except UnicodeDecodeError:
+ pass
+
+ icon_elem = doc.css_first("link[type='image/x-icon']")
+ if icon_elem and "href" in icon_elem.attrs:
+ if "journalssystem.com" in icon_elem.attrs["href"]:
+ return "journalssystem.com"
+ elif "indexcopernicus.com" in icon_elem.attrs["href"]:
+ return "indexcopernicus"
+
+ if "scielo" in url:
+ return "scielo"
+
+ return None
+
+
+def html_guess_scope(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]
+) -> str:
+ """
+ This function tries to guess if an HTML document represents one of:
+
+ - article-fulltext
+ - article-abstract
+ - article-sample
+ - supplement
+ - component
+ - issue-fulltext
+ - landingpage
+ - homepage-domain
+ - blocked-paywall
+ - blocked-login
+ - blocked-captcha
+ - blocked-cookie
+ - errorpage
+ - stub
+ - other
+ - unknown
+
+ Unknown implies the page could be anything. "other" implies it is not
+ fulltext or a landing page, but could be one of the other categories.
+ """
+
+ # assert that this is a real URL
+ assert url.count("/") >= 2
+
+ # basic paywall and loginwall detection based on URL
+ if url.endswith("/cookieAbsent"):
+ return "blocked-cookie"
+ if "://page-one.live.cf.public.springer.com" in url:
+ return "article-sample"
+
+ if "scielo" in url:
+ if "sci_abstract" in url:
+ return "landingpage"
+ if "sci_arttext" in url:
+ return "article-fulltext"
+
+ if "showcaptcha.asp" in url:
+ return "blocked-captcha"
+
+ # is this the top-level URL of the domain? aka, no path?
+ if url.count("/") <= 2 or (url.count("/") == 3 and url.endswith("/")):
+ return "homepage-domain"
+
+ platform = html_guess_platform(url, doc, biblio)
+
+ if biblio:
+ if biblio.html_fulltext_url:
+ if url_fuzzy_equal(biblio.html_fulltext_url, url):
+ return "article-fulltext"
+ else:
+ return "landingpage"
+
+ # platform-specific detection
+ if platform in ("ojs", "ojs3"):
+
+ if biblio and biblio.title:
+ if word_count and word_count > 1200:
+ return "fulltext"
+ else:
+ return "landingpage"
+ else:
+ if "/article/view/" in url and word_count and word_count > 600:
+ return "fulltext"
+ return "other"
+ elif platform == "journalssystem.com":
+ if biblio and biblio.pdf_fulltext_url and word_count and word_count < 1000:
+ return "landingpage"
+
+ # more platform/publisher specific checks
+ if "karger.com/Article/Abstract" in url:
+ return "landingpage"
+ if "dergipark.gov.tr" in url and not ("download/article-file" in url):
+ return "other"
+
+ try:
+ if isinstance(doc.html, str) and "<center><h1>403 Forbidden</h1></center>" in doc.html:
+ # cloudflare block pattern
+ return "blocked-forbidden"
+ except UnicodeDecodeError:
+ pass
+
+ print(f" scope guessing: platform {platform} word count: {word_count}", file=sys.stderr)
+
+ # fallback: guess based on word count (arbitrary guesses here)
+ if word_count is not None:
+ if word_count < 20:
+ return "stub"
+ elif word_count > 500 and platform in ["wordpress", "blogger"]:
+ return "article-fulltext"
+ elif word_count > 1200:
+ return "article-fulltext"
+
+ return "unknown"
+
+
+def run_single(
+ url: str, timestamp: Optional[str] = None, quick_mode: bool = False
+) -> IngestWebResult:
+
+ adblock = load_adblock_rules()
+ wayback_client = WaybackClient()
+
+ html_resource = wayback_client.lookup_resource(url, "text/html", closest=timestamp)
+ if html_resource.status != "success":
+ return IngestWebResult(
+ status=html_resource.status,
+ hit=False,
+ cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+ )
+
+ assert html_resource.terminal_status_code == 200
+
+ file_meta = gen_file_metadata(html_resource.body)
+ file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)
+
+ if file_meta["mimetype"] not in ("text/html", "text/xml"):
+ return IngestWebResult(
+ status="wrong-mimetype",
+ hit=False,
+ cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+ file_meta=file_meta,
+ )
+
+ html_doc = HTMLParser(html_resource.body)
+ html_biblio = html_extract_biblio(url, html_doc)
+ html_body = html_extract_body_teixml(html_resource.body)
+ html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get("word_count"))
+ if html_scope not in ("article-fulltext", "unknown"):
+ return IngestWebResult(
+ status="wrong-scope",
+ hit=False,
+ cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+ file_meta=file_meta,
+ html_biblio=html_biblio,
+ scope=html_scope,
+ )
+
+ raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
+ assert len(raw_resources) <= 200
+
+ when = parse_cdx_datetime(html_resource.cdx.datetime)
+
+ full_resources: List[WebResource] = []
+ if quick_mode:
+ full_resources = quick_fetch_html_resources(
+ raw_resources, wayback_client.cdx_client, when
+ )
+ else:
+ full_resources = fetch_html_resources(raw_resources, wayback_client, when)
+
+ output = IngestWebResult(
+ status="success",
+ hit=True,
+ cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+ file_meta=file_meta,
+ html_body=html_body,
+ html_biblio=html_biblio,
+ scope=html_scope,
+ html_resources=full_resources,
+ )
+ return output
+
+
+def main() -> None:
+ """
+ Run this command like:
+
+ python -m sandcrawler.ingest_html
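+
+ For example, to ingest a single URL (URL and timestamp are illustrative):
+
+ python -m sandcrawler.ingest_html single --timestamp 20201028235103 https://example.com/article/123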
+ """
+
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ subparsers = parser.add_subparsers()
+
+ sub = subparsers.add_parser(
+ "single", help="tries to ingest a single URL, dumps result to stdout"
+ )
+ sub.set_defaults(func="run_single")
+ sub.add_argument(
+ "url",
+ help="URL to fetch",
+ type=str,
+ )
+ sub.add_argument(
+ "--timestamp",
+ help="timestamp for which to fetch document from wayback",
+ type=str,
+ )
+ sub.add_argument(
+ "--quick-mode",
+ help="don't fetch resources, only do CDX lookup",
+ action="store_true",
+ )
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ parser.print_help(file=sys.stderr)
+ sys.exit(-1)
+
+ if args.func == "run_single":
+ result = run_single(args.url, args.timestamp, args.quick_mode)
+ print(result.json(indent=2, exclude_none=True))
+ else:
+ # func = getattr(wp, args.func)
+ # func()
+ raise NotImplementedError()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
new file mode 100644
index 0000000..8836515
--- /dev/null
+++ b/python/sandcrawler/minio.py
@@ -0,0 +1,118 @@
+import hashlib
+import io
+from typing import Optional, Tuple, Union
+
+import minio
+
+
+class SandcrawlerMinioClient(object):
+ def __init__(
+ self,
+ host_url: str,
+ access_key: str,
+ secret_key: str,
+ default_bucket: Optional[str] = None,
+ ):
+ """
+ host is minio connection string (host:port)
+ access and secret key are as expected
+ default_bucket can be supplied so that it doesn't need to be repeated for each function call
+
+ Example config:
+
+ host="localhost:9000",
+ access_key=os.environ['SANDCRAWLER_BLOB_ACCESS_KEY'],
+ secret_key=os.environ['SANDCRAWLER_BLOB_SECRET_KEY'],
+ """
+ self.mc = minio.Minio(
+ host_url,
+ access_key=access_key,
+ secret_key=secret_key,
+ secure=False,
+ )
+ self.default_bucket = default_bucket
+
+ def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str:
+ if not extension:
+ extension = ""
+ if not prefix:
+ prefix = ""
+ assert len(sha1hex) == 40
+ obj_path = "{}{}/{}/{}/{}{}".format(
+ prefix,
+ folder,
+ sha1hex[0:2],
+ sha1hex[2:4],
+ sha1hex,
+ extension,
+ )
+ return obj_path
+
+ def put_blob(
+ self,
+ folder: str,
+ blob: Union[str, bytes],
+ sha1hex: Optional[str] = None,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> Tuple[str, str]:
+ """
+ blob should be bytes
+ sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
+ Uploads blob to path in the given bucket. Files are stored in a top-level
+ folder, then in two levels of sub-directory based on sha1, then the
+ filename is SHA1 with an optional file extension.
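+
+ For example (values illustrative), folder="grobid", extension=".tei.xml" and
+ an empty prefix yield an object path like:
+ grobid/01/23/0123456789abcdef0123456789abcdef01234567.tei.xml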
+ """
+ if isinstance(blob, str):
+ blob = blob.encode("utf-8")
+ assert isinstance(blob, bytes)
+ if not sha1hex:
+ h = hashlib.sha1()
+ h.update(blob)
+ sha1hex = h.hexdigest()
+ obj_path = self._blob_path(folder, sha1hex, extension, prefix)
+ if not bucket:
+ bucket = self.default_bucket
+ assert bucket
+ content_type = "application/octet-stream"
+ if extension.endswith(".xml"):
+ content_type = "application/xml"
+ elif extension.endswith(".png"):
+ content_type = "image/png"
+ elif extension.endswith(".jpg") or extension.endswith(".jpeg"):
+ content_type = "image/jpeg"
+ elif extension.endswith(".txt"):
+ content_type = "text/plain"
+ self.mc.put_object(
+ bucket,
+ obj_path,
+ io.BytesIO(blob),
+ len(blob),
+ content_type=content_type,
+ )
+ return (bucket, obj_path)
+
+ def get_blob(
+ self,
+ folder: str,
+ sha1hex: str,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> bytes:
+ """
+ sha1hex is sha1 of the blob itself
+
+ Fetches the blob from the given bucket/folder, using the sandcrawler SHA-1 path convention
+ """
+ obj_path = self._blob_path(folder, sha1hex, extension, prefix)
+ if not bucket:
+ bucket = self.default_bucket
+ assert bucket
+ blob = self.mc.get_object(
+ bucket,
+ obj_path,
+ )
+ # TODO: optionally verify SHA-1?
+ return blob
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 4ffc5d7..4e37036 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,17 +1,70 @@
-
import base64
-import magic
-import hashlib
import datetime
+import hashlib
+import os
+from typing import List, Optional
+
+import magic
+import requests
+import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+
+
+def clean_url(s: str) -> str:
+ s = s.strip()
+ parsed = urlcanon.parse_url(s)
+ if not parsed.port and parsed.colon_before_port:
+ parsed.colon_before_port = b""
+ return str(urlcanon.whatwg(parsed))
-def gen_file_metadata(blob):
+
+def url_fuzzy_equal(left: str, right: str) -> bool:
+ """
+ TODO: use proper surt library and canonicalization for this check
+ """
+ fuzzy_left = "://".join(
+ clean_url(left).replace("www.", "").replace(":80/", "/").split("://")[1:]
+ )
+ fuzzy_right = "://".join(
+ clean_url(right).replace("www.", "").replace(":80/", "/").split("://")[1:]
+ )
+ if fuzzy_left == fuzzy_right:
+ return True
+ elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/":
+ return True
+ return False
+
+
+def test_url_fuzzy_equal() -> None:
+ assert (
+ url_fuzzy_equal(
+ "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+ "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
+ )
+ is True
+ )
+
+
+def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
"""
Takes a file blob (bytestream) and returns hashes and other metadata.
Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype
"""
- assert blob
- mimetype = magic.Magic(mime=True).from_buffer(blob)
+ assert blob is not None
+ if not allow_empty:
+ assert blob
+ if len(blob) < 1024 * 1024:
+ mimetype = magic.Magic(mime=True).from_buffer(blob)
+ else:
+ mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)])
+ if mimetype in ("application/xml", "text/xml"):
+ # crude checks for XHTML or JATS XML, using only first 1 kB of file
+ if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
+ mimetype = "application/xhtml+xml"
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
+ mimetype = "application/jats+xml"
hashes = [
hashlib.sha1(),
hashlib.sha256(),
@@ -27,7 +80,50 @@ def gen_file_metadata(blob):
mimetype=mimetype,
)
-def b32_hex(s):
+
+def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
+ """
+ Variant of gen_file_metadata() which works with files on local disk
+ """
+ assert path is not None
+ mimetype = magic.Magic(mime=True).from_file(path)
+ if mimetype in ("application/xml", "text/xml"):
+ with open(path, "rb") as f:
+ blob = f.read(1024)
+ # crude checks for XHTML or JATS XML, using only first 1 kB of file
+ if (
+ b"<htm" in blob[:1024]
+ and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]
+ ):
+ mimetype = "application/xhtml+xml"
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
+ mimetype = "application/jats+xml"
+ hashes = [
+ hashlib.sha1(),
+ hashlib.sha256(),
+ hashlib.md5(),
+ ]
+ size_bytes = 0
+ with open(path, "rb") as f:
+ while True:
+ chunk = f.read(1024 * 1024)
+ if not chunk:
+ break
+ size_bytes += len(chunk)
+ for h in hashes:
+ h.update(chunk)
+ if not allow_empty:
+ assert size_bytes > 0
+ return dict(
+ size_bytes=size_bytes,
+ sha1hex=hashes[0].hexdigest(),
+ sha256hex=hashes[1].hexdigest(),
+ md5hex=hashes[2].hexdigest(),
+ mimetype=mimetype,
+ )
+
+
+def b32_hex(s: str) -> str:
"""
Converts a base32-encoded SHA-1 checksum into hex-encoded
@@ -40,30 +136,45 @@ def b32_hex(s):
if len(s) == 40:
return s
raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
NORMAL_MIME = (
- 'application/pdf',
- 'application/postscript',
- 'text/html',
- 'text/xml',
+ "application/pdf",
+ "application/postscript",
+ "text/html",
+ "text/xml",
+ "application/octet-stream",
)
-def normalize_mime(raw):
- raw = raw.lower()
+
+def normalize_mime(raw: str) -> Optional[str]:
+ raw = raw.lower().strip()
for norm in NORMAL_MIME:
if raw.startswith(norm):
return norm
# Special cases
- if raw.startswith('application/xml'):
- return 'text/xml'
- if raw.startswith('application/x-pdf'):
- return 'application/pdf'
+ if raw.startswith("application/xml"):
+ return "text/xml"
+ if raw.startswith("application/x-pdf"):
+ return "application/pdf"
+ if raw in (".pdf",):
+ return "application/pdf"
+ if raw in (
+ "application/download",
+ "binary/octet-stream",
+ "unk",
+ "application/x-download",
+ "application/octetstream",
+ "application/force-download",
+ "application/unknown",
+ ):
+ return "application/octet-stream"
return None
-def test_normalize_mime():
+def test_normalize_mime() -> None:
assert normalize_mime("asdf") is None
assert normalize_mime("application/pdf") == "application/pdf"
assert normalize_mime("application/pdf+journal") == "application/pdf"
@@ -72,9 +183,11 @@ def test_normalize_mime():
assert normalize_mime("application/xml+stuff") == "text/xml"
assert normalize_mime("application/x-pdf") == "application/pdf"
assert normalize_mime("application/x-html") is None
+ assert normalize_mime("unk") == "application/octet-stream"
+ assert normalize_mime("binary/octet-stream") == "application/octet-stream"
-def parse_cdx_line(raw_cdx, normalize=True):
+def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
"""
This method always filters a few things out:
@@ -95,39 +208,110 @@ def parse_cdx_line(raw_cdx, normalize=True):
offset = cdx[9]
warc = cdx[10]
- if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
- and len(sha1b32) == 32 and dt.isdigit()):
+ if not (
+ sha1b32.isalnum()
+ and c_size.isdigit()
+ and offset.isdigit()
+ and len(sha1b32) == 32
+ and dt.isdigit()
+ ):
return None
- if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+ if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
return None
- if mime is None or mime == '-':
+ if mime is None or mime == "-":
mime = "application/octet-stream"
if normalize:
mime = normalize_mime(mime)
sha1hex = b32_hex(sha1b32)
- http_status = int(http_status)
- c_size = int(c_size)
- offset = int(offset)
return dict(
surt=surt,
url=url,
datetime=dt,
mimetype=mime,
- http_status=http_status,
+ http_status=int(http_status),
sha1b32=sha1b32,
sha1hex=sha1hex,
- warc_csize=c_size,
- warc_offset=offset,
+ warc_csize=int(c_size),
+ warc_offset=int(offset),
warc_path=warc,
)
-def parse_cdx_datetime(dt_str):
+
+def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
+ if not dt_str:
+ return None
try:
- return datetime.strptime(dt_str, "%Y%m%d%H%M%S")
+ return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
except Exception:
return None
+
+
+def test_parse_cdx_datetime() -> None:
+ assert parse_cdx_datetime("") is None
+ assert parse_cdx_datetime("asdf") is None
+ assert parse_cdx_datetime("19930203123045") is not None
+ assert parse_cdx_datetime("20201028235103") == datetime.datetime(
+ year=2020, month=10, day=28, hour=23, minute=51, second=3
+ )
+
+
+def datetime_to_cdx(dt: datetime.datetime) -> str:
+ return "%04d%02d%02d%02d%02d%02d" % (
+ dt.year,
+ dt.month,
+ dt.day,
+ dt.hour,
+ dt.minute,
+ dt.second,
+ )
+
+
+def test_datetime_to_cdx() -> None:
+ assert "20201028235103" == datetime_to_cdx(
+ datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)
+ )
+
+
+def requests_retry_session(
+ retries: int = 10,
+ backoff_factor: int = 1,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: Optional[requests.Session] = None,
+) -> requests.Session:
+ """
+ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
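+
+ Typical usage (URL illustrative):
+
+ session = requests_retry_session(retries=3)
+ resp = session.get("https://example.com/api")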
+ """
+ session = session or requests.Session()
+ retry = Retry(
+ total=retries,
+ read=retries,
+ connect=retries,
+ backoff_factor=backoff_factor,
+ status_forcelist=status_forcelist,
+ )
+ adapter = HTTPAdapter(max_retries=retry)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+ return session
+
+
+def sanitize_fs_path(path: str) -> str:
+ """
+ From: https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python/66950540#66950540
+ """
+ # - pretending to chroot to the current directory
+ # - cancelling all redundant paths (/.. = /)
+ # - making the path relative
+ return os.path.relpath(os.path.normpath(os.path.join("/", path)), "/")
+
+
+def test_sanitize_fs_path() -> None:
+ assert sanitize_fs_path("/thing.png") == "thing.png"
+ assert sanitize_fs_path("../../thing.png") == "thing.png"
+ assert sanitize_fs_path("thing.png") == "thing.png"
+ assert sanitize_fs_path("subdir/thing.png") == "subdir/thing.png"
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
new file mode 100644
index 0000000..97d338e
--- /dev/null
+++ b/python/sandcrawler/pdfextract.py
@@ -0,0 +1,502 @@
+import datetime
+import json
+import sys
+from dataclasses import dataclass
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
+
+import poppler
+from PIL import Image
+
+from .ia import WaybackClient
+from .misc import gen_file_metadata
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
+# This is a hack to work around timeouts when processing certain PDFs with
+# poppler. For some reason, the usual Kafka timeout catcher isn't working on
+# these, maybe due to threading.
+BAD_PDF_SHA1HEX: List[str] = [
+ "011478a1e63a2a31eae1a93832a74cc95f220760",
+ "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
+ "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
+ "06061af0707298c12932516d1bb7c2b6dc443824",
+ "0641822e68c5a07538b967489fd19a1d5dc371a5",
+ "09cba9b00494d12759c50cb914f1fb7c9746f5d1",
+ "09db7c9f2efb496c974427a61e84292ae27fc702",
+ "0a1c13cb8783bbbf248b2345b9890e2410aa3f0a",
+ "0ccc6dc94f4e2d809fac8543870265c3421f3c9e",
+ "0d1c1567ea70e7b922ba88ccb868ffc7ca18e75c",
+ "10c6577a658bf6203557e2998b25ea9788f8adfe",
+ "15a720921ce30da983fcd1bfa7fe9aeeda503e41",
+ "1659881a31edc2d0e170f6bb26d32e74cc4ca387",
+ "17e679b0ec9444fff2ea4d02caec05dd2de80ec3",
+ "182749ad1db1d5e999d07f010bdcfc2978dadc88",
+ "1a17a4fc43397804830cc29021281aac2e8cf0cb",
+ "1cb166f0c0b5ffe673e6bbf6a29d77278711f253",
+ "1d04e46b6848e6479dd90fe26bb11627044fb664",
+ "1d967c95546d31edaaf0c3ef9ffcc11113a9e11a",
+ "1f90194bf0c7fff1fe1ed5fff77a934c7a1b32a0",
+ "20589d9dd0a22c8c938ad97b7f4f12648aa119fa",
+ "2195e528fa1cf5f8ae3b2adcc516896016c3411f",
+ "25ab9e6169f041be05844a9b4edd6574918af769",
+ "281de904c4642a9be4f17b9774fc0a2bdc8a90e3",
+ "2bd5322975653536550a039eb055174b2bf241b3",
+ "2fc64da736175810918fd32c94c5068b0d660bcc",
+ "32318fba9b05b2756b7362bcaa4722c92ed8d449",
+ "336833c6fc968cd0938250dfc93c032a30111cfc",
+ "362ad00bc24d650c8f11851f9e554fc560b73e7a",
+ "373f84dfab4ed47047826e604e2918a9cd6a95b2",
+ "3ac0b6e17e30d141871a0a5b127536919fe5aa19",
+ "3c8a6a708da0dc1802f5f3e5267a49b3c25e1ffe",
+ "3e5f9fb94e7314447a22f3d009419a922136177f",
+ "3fad493c940137ce703f2f570ebb504e360c6df3",
+ "40aa94602ab13e5a7d9df8c989fca4fa5c01239e",
+ "427479c94d7d0e512f898bc7ff0b6f210069f902",
+ "436c9183724f051b22c96285aa8ff1d2ba709574",
+ "43a8c0abf0386d3e3397cf5e22a884761dd63db7",
+ "445968ef735b228c08c3ff4238d99fc9f4824619",
+ "447fa6b5a90742a86429a932f6608d8e141688c0",
+ "45f014d7d631559dc7726e5c5513f1e7c91c48a9",
+ "47577ff6d6876117ca69bec60a5764f7d2c2ec70",
+ "4785181cec8944eee00ddb631a5dfc771b89bab7",
+ "47db2db2cc976429568841a0496c0ab4ed7b5977",
+ "481c0bae81873988fcc8662ba8a269e8823fdea2",
+ "4c81129904f7976a50825595a3497ea7b52579ef",
+ "4edc1402712fa6827c4501fed8042e9f4447829c",
+ "50b3c5a3122272aca69855ef06b85d0b43a76eb1",
+ "52fc9b3c5199ef395d410c7cee5961dc812e4d29",
+ "53471346019947a88c1ba141fb829375527153b0",
+ "58d9ae7dcb0a7dbbdfc58ad266030b037e9cd0ff",
+ "59cfc843ebdb1c1e5db1efc76a40f46cb3bb06f0",
+ "5ab98405b676ee81a6ca74fba51a9e4a6cff7311",
+ "5c5b45c85eff07d4302844e00ec8baa57b988c60",
+ "5e04779cbbae5ce88bb786064f756885dd6895fe",
+ "5e6a3adde9f08c276c4efd72bfacb256f2ec35d9",
+ "62247fe6b8d3ca50477cafddbe24bf63832d6674",
+ "623ff84b616383d0a3e0dd8dbce12f0b5fe9a6ac",
+ "646c4a654270606256397684204ff0f3d17be2e7",
+ "64d821d728f9a3dc944b4c03be00feea0b57e314",
+ "668b7d777203af4b261d21bf4669fc9b385062e1",
+ "689b5cb3ddef213d612363a903f10d0358ea64d2",
+ "6909f0b62d8b7835de3dec7777aad7f8ef507ee3",
+ "74e617dc95555e8ca3aadd19d0c85b71cd77d1d9",
+ "7596438d77444a7c4228bb96fa4b394ba7d7e23b",
+ "75c2662a96ccc48891228df7c85eb7d4da9dd621",
+ "771f1ca0007a6fbed5b4a434c73f524f715d33c1",
+ "776859635e9dc01d97b0582f49c814ffbcb019fb",
+ "781dafda896a9f5c30f3d0a011f79a3b79b574c4",
+ "788672c7c2bcdecf6e2f6a2177c01e60f04d9cfb",
+ "79d6cba3c6e577a0f3a3a9fe575680d38454938d",
+ "7b8b7e8e4b789579a7d2fda329db52528383a652",
+ "7c5c925cfb7c5a861b5c0a1d923308f9bedd335e",
+ "7cfc0739be9c49d94272110a0a748256bdde9be6",
+ "7daf61526ec825151f384cc1db510ca5237d5d80",
+ "7e9d846f3bf9ce15cdb991b78cc870ab8a2bed76",
+ "800e47a7ed214f7acac85cc29aa7b0f9c0e218ae",
+ "8398b211a5ec4da1195a4ba1bc29ca8c0ac40f67",
+ "859d7ec532a0bf3b52b17c7f2d8ecc58410c0aad",
+ "88edcbab1cac2d70af5870422974afc253f4f0c6",
+ "89860fc475fcb2a2d86c4544df52ec8fd5e6533f",
+ "8dcaf4ef132900dd378f7be526c884b17452713b",
+ "8e4f03c29ae1fe7227140ab4b625f375f6c00d31",
+ "8ec1a17ec19ae8ade95b9bdc837236981e83fffb",
+ "949dfb7d833da9576b2ccb9eb1ab5457469c53d3",
+ "961ec451172f373f919c593737466300e42062cb",
+ "976989fa6e447578d9ce16ec5b526f0e09d6df50",
+ "977f23723027d7052df9b49eb467e6c0b9af93ff",
+ "98b02eb70066c182c705ef4d14d8b723ad7f1fab",
+ "993ca31f6974f8387bb18dd7d38987d290da8781",
+ "9dbd05af3442e6f42d67868054751b76973f4171",
+ "a1cc781c694a48e018f4de110b58f561aa212051",
+ "a2298c137b9c8c8975bad62eea9224edb95e6952",
+ "a2671738755ab8b24775e95375dc72f1ca4e5fd6",
+ "a26f299fb97c646effeebd4c5e2968786bd0f781",
+ "a48f9b7ad627909f76d780aa4208530304ece42c",
+ "a69665d0b5d3b95f54f68406eee3ed50c67efb45",
+ "a69665d0b5d3b95f54f68406eee3ed50c67efb45",
+ "a8357c31837404f9ebd798999d546c9398ab3648",
+ "a9162b9aef5e5da0897275fede1a6cff8cc93dfc",
+ "abc9d264df446707b40d7c9f79befd0f89291e59",
+ "ad038725bf6855a79f3c768ebe93c7103d14522f",
+ "aef581bf42e76e527f5aed3b8958fd4e7a24819f",
+ "b2b66b9c7f817a20144456f99c0be805602e8597",
+ "b2d719120306b90eb8dd3580b699a61ec70556f4",
+ "b4b8e18e27f102e59b2be2d58c7b54d0a0eb457a",
+ "b5be7f409a3a2601208c5ce08cf52b9ac1094aae",
+ "b5bf8b7467fb095c90adf3b49aa1687291e4469c",
+ "b8b427e5b3d650ba9e03197f9c3917e25b878930",
+ "bad48b89b639b5b7df2c6a2d5288181fcb8b0e35",
+ "be0cda7642e9247b3ee41cd2017fa709aab4f344",
+ "beff1b0c24aa99989be73c66dfb1d1e7578e370b",
+ "c1b583fbd052572f08158d39ffe4d7510dadbebb",
+ "c2526f75a013dc67b14ce1e2d0e4fc80bb93c6e1",
+ "c4abbb284f4acaca9e8ceb88f842901984e84d33",
+ "c58e028269c8dfd3a442f6745c81b4c0e8610c43",
+ "c7220d1bf1e71fb755d9f26bbdd4c539dc162960",
+ "c7687fa6f637c7d32a25be0e772867d87536d35c",
+ "c7d8b37ec99cf0d987e60667f05299f200e18a5d",
+ "c92b9ae9eefa07504950b405625aef54b48f0e1a",
+ "ccb1debcfae006a3fc984e9e91309b9706a5c375",
+ "cd611c765cbb0b3b7cb2fdc07d8f0b9cc93ec257",
+ "cd8a7c3b8d850ebedc1ca791ccb37b9a2689f9c3",
+ "d055c054c330f99ec011e37186d2b429339758fd",
+ "d17b1e254cce82df5c6eb4fd492cef91e7e11558",
+ "d188762a7e3ab5d4ee8a897204316513e4e636ec",
+ "d613b9e4442f5d5d19ea6814fa9729bff7da7c85",
+ "d6b0f405bf13c23d0e90c54eea527442786d1cd3",
+ "d91d3830bf455e6dd782eee46218e35d29f07dfd",
+ "da2211ee2dbc6dda36571976d810e2366a3d2504",
+ "dbb3093a797e0ae83d39eb7b235ff85a17fd965c",
+ "e01bb7256d77aea258313bb410dfcfc10512f420",
+ "e2bf5d0a5885359381fe8ef2cd9290171d494e9b",
+ "e2c3b8a2cf33d5e8972bc9ddb78373766a75e412",
+ "e64714a81f60ab9286ec90cad682cb22e564fb6f",
+ "e9d7716b4f94bbc3d94459b5fe9bb8b15cb2e433",
+ "e9e84e17383e93a784a8471708619162b32fb399",
+ "eac7df5f799983d5a7cc55d10b4d426dc557febf",
+ "eaf84b2efd2f69c7b3f407f89ea66ac4c41fac36",
+ "eb1b39fd7a874896688855a22efddef10272427c",
+ "eb5fffaa590a52bcc3705b888c6ff9c4dc4c45b2",
+ "ecc4b927c5e84e145c610876931bc261ae13769b",
+ "edf8dcc8736f06afbaca0e01d60bd2c475403a3d",
+ "ee2ee6ae2cf05128810d0d95bbe69bd263e140de",
+ "ee9530a2c5a3d1e3813ccb51a55cc8b0d9b5dfc7",
+ "ef1dfa325c21cff4cd8bb1a9b6c4ee6996d43c8f",
+ "ef6749d9263a01f921ba7d72df0d17671d14e5f6",
+ "f0ea221d8587cede25592266486e119d277f7096",
+ "f68f9a9202a75d2aee35252e104d796f9515001e",
+ "f9314d3bf2eac78a7d78d18adcccdb35542054ef",
+ "f932ef936021a3b00842b481478c40868b9a007c",
+ "fd9bd560662e070b222d63052830837829c490f0",
+]
+
+
+@dataclass
+class PdfExtractResult:
+ sha1hex: str
+ status: str
+ error_msg: Optional[str] = None
+ file_meta: Optional[Dict[str, Any]] = None
+ text: Optional[str] = None
+ page0_thumbnail: Optional[bytes] = None
+ has_page0_thumbnail: bool = False
+ meta_xml: Optional[str] = None
+ pdf_info: Optional[Dict[str, Any]] = None
+ pdf_extra: Optional[Dict[str, Any]] = None
+ source: Optional[Dict[str, Any]] = None
+
+ def to_pdftext_dict(self) -> dict:
+ """
+ Returns a dict, as would be serialized to JSON and published to the Kafka text/info topic.
+ """
+ return {
+ "key": self.sha1hex,
+ "sha1hex": self.sha1hex,
+ "status": self.status,
+ "file_meta": self.file_meta,
+ "error_msg": self.error_msg,
+ "text": self.text,
+ "has_page0_thumbnail": self.has_page0_thumbnail,
+ "meta_xml": self.meta_xml,
+ "pdf_info": self.pdf_info,
+ "pdf_extra": self.pdf_extra,
+ "source": self.source,
+ }
+
+ @staticmethod
+ def from_pdftext_dict(record: Dict[str, Any]) -> "PdfExtractResult":
+ """
+ Parses a dict (as consumed from the Kafka text/info topic) back into a PdfExtractResult.
+ """
+ if record["status"] != "success":
+ return PdfExtractResult(
+ sha1hex=record.get("sha1hex") or record["key"],
+ status=record["status"],
+ error_msg=record.get("error_msg"),
+ )
+ else:
+ return PdfExtractResult(
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ file_meta=record.get("file_meta"),
+ text=record.get("text"),
+ has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)),
+ meta_xml=record.get("meta_xml"),
+ pdf_info=record.get("pdf_info"),
+ pdf_extra=record.get("pdf_extra"),
+ )
+
+ @staticmethod
+ def from_pdf_meta_dict(record: Dict[str, Any]) -> "PdfExtractResult":
+ """
+ Parses what would be returned from postgrest
+ """
+ if record["status"] != "success":
+ return PdfExtractResult(
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ error_msg=(record.get("metadata") or {}).get("error_msg"),
+ )
+ else:
+ pdf_extra = dict()
+ for k in (
+ "page_count",
+ "page0_height",
+ "page0_width",
+ "permanent_id",
+ "pdf_version",
+ ):
+ if record.get(k):
+ pdf_extra[k] = record[k]
+ return PdfExtractResult(
+ sha1hex=record["sha1hex"],
+ status=record["status"],
+ has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)),
+ pdf_info=record.get("metadata"),
+ pdf_extra=pdf_extra,
+ )
+
+ def to_sql_tuple(self) -> tuple:
+ # pdf_meta (sha1hex, updated, status, page0_thumbnail, page_count,
+ # word_count, page0_height, page0_width, permanent_id, pdf_created,
+ # pdf_version, metadata)
+ word_count: Optional[int] = None
+ if self.text:
+ word_count = len(self.text.split())
+ metadata: Optional[Dict] = None
+ pdf_extra = self.pdf_extra or dict()
+ pdf_created = None
+ # TODO: form, encrypted
+ if self.pdf_info:
+ metadata = dict()
+ for k in ("Title", "Subject", "Author", "Creator", "Producer", "doi"):
+ if k in self.pdf_info:
+ metadata[k.lower()] = self.pdf_info[k]
+ if "CreationDate" in self.pdf_info:
+ pdf_created = self.pdf_info["CreationDate"]
+ metadata_json: Optional[str] = None
+ if metadata:
+ metadata_json = json.dumps(metadata, sort_keys=True)
+ return (
+ self.sha1hex,
+ datetime.datetime.now(), # updated
+ self.status,
+ self.has_page0_thumbnail,
+ pdf_extra.get("page_count"),
+ word_count,
+ pdf_extra.get("page0_height"),
+ pdf_extra.get("page0_width"),
+ pdf_extra.get("permanent_id"),
+ pdf_created,
+ pdf_extra.get("pdf_version"),
+ metadata_json,
+ )
+
+
+def process_pdf(
+ blob: bytes, thumb_size: Tuple[int, int] = (180, 300), thumb_type: str = "JPEG"
+) -> PdfExtractResult:
+ """
+ A known issue is that output text is in "physical layout" mode, which means
+ columns will be side-by-side. We would prefer a single stream of tokens!
+
+ Tried using page.text(layout_mode=poppler.TextLayout.raw_order_layout)
+ instead of the default mode (poppler.TextLayout.physical_layout), but that
+ didn't seem to work at all (returned empty strings).
+ """
+ file_meta = gen_file_metadata(blob)
+ sha1hex = file_meta["sha1hex"]
+ if file_meta["mimetype"] != "application/pdf":
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="not-pdf",
+ error_msg=f"mimetype is '{file_meta['mimetype']}'",
+ file_meta=file_meta,
+ )
+
+ if sha1hex in BAD_PDF_SHA1HEX:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="bad-pdf",
+ error_msg="PDF known to cause processing issues",
+ file_meta=file_meta,
+ )
+
+ print(f"\tpoppler processing: {sha1hex}", file=sys.stderr)
+ try:
+ pdf = poppler.load_from_data(blob)
+ if pdf is None:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="empty-pdf",
+ file_meta=file_meta,
+ has_page0_thumbnail=False,
+ )
+ page0 = pdf.create_page(0)
+ if page0 is None:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="empty-page0",
+ file_meta=file_meta,
+ )
+ # this call sometimes fails and returns an AttributeError
+ page0rect = page0.page_rect()
+ # NOTE: poppler sometimes throws a 'ValueError', but this is pretty broad to catch
+ except (AttributeError, poppler.document.LockedDocumentError, ValueError) as e:
+ # may need to expand the set of exceptions caught here over time, but
+ # starting with a narrow set
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="parse-error",
+ error_msg=str(e),
+ file_meta=file_meta,
+ )
+
+ assert page0 is not None
+ page0_thumbnail: Optional[bytes] = None
+ renderer = poppler.PageRenderer()
+ try:
+ full_img = renderer.render_page(page0)
+ img = Image.frombuffer(
+ "RGBA", (full_img.width, full_img.height), full_img.data, "raw", "BGRA", 0, 1
+ )
+ img.thumbnail(thumb_size, Image.BICUBIC)
+ buf = BytesIO()
+ img.save(buf, thumb_type)
+ page0_thumbnail = buf.getvalue()
+ # assuming that very small images mean something went wrong
+ if page0_thumbnail is None or len(page0_thumbnail) < 50:
+ page0_thumbnail = None
+ except Exception as e:
+ print(str(e), file=sys.stderr)
+ page0_thumbnail = None
+
+ try:
+ full_text = page0.text()
+ for n in range(1, pdf.pages):
+ pageN = pdf.create_page(n)
+ full_text += pageN.text()
+ except AttributeError as e:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="parse-error",
+ error_msg=str(e),
+ file_meta=file_meta,
+ )
+
+ # Kafka message size limit; cap at about 1 MByte
+ if len(full_text) > 1000000:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="text-too-large",
+ error_msg="full_text chars: {}".format(len(full_text)),
+ file_meta=file_meta,
+ )
+ if len(pdf.metadata) > 1000000:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="text-too-large",
+ error_msg="meta_xml chars: {}".format(len(full_text)),
+ file_meta=file_meta,
+ )
+
+ try:
+ pdf_info = pdf.infos()
+ except UnicodeDecodeError:
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ status="bad-unicode",
+ error_msg="in infos()",
+ file_meta=file_meta,
+ )
+
+ # TODO: is this actually needed? or does json marshalling work automatically?
+ for k in pdf_info.keys():
+ if isinstance(pdf_info[k], datetime.datetime):
+ pdf_info[k] = datetime.datetime.isoformat(pdf_info[k])
+
+ permanent_id: Optional[str] = None
+ update_id: Optional[str] = None
+ try:
+ permanent_id = pdf.pdf_id.permanent_id
+ update_id = pdf.pdf_id.update_id
+ except TypeError:
+ pass
+
+ return PdfExtractResult(
+ sha1hex=sha1hex,
+ file_meta=file_meta,
+ status="success",
+ error_msg=None,
+ text=full_text or None,
+ has_page0_thumbnail=page0_thumbnail is not None,
+ page0_thumbnail=page0_thumbnail,
+ meta_xml=pdf.metadata or None,
+ pdf_info=pdf_info,
+ pdf_extra=dict(
+ page0_height=page0rect.height,
+ page0_width=page0rect.width,
+ page_count=pdf.pages,
+ permanent_id=permanent_id,
+ update_id=update_id,
+ pdf_version=f"{pdf.pdf_version[0]}.{pdf.pdf_version[1]}",
+ ),
+ )
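+
+# Illustrative usage of process_pdf() (file path is hypothetical):
+#     with open("example.pdf", "rb") as f:
+#         result = process_pdf(f.read())
+#     if result.status == "success":
+#         print(result.pdf_extra["page_count"], "pages extracted")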
+
+
+class PdfExtractWorker(SandcrawlerFetchWorker):
+ def __init__(
+ self,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs,
+ ):
+ super().__init__(wayback_client=wayback_client)
+ self.wayback_client = wayback_client
+ self.sink = sink
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
+
+ def timeout_response(self, task: Dict[str, Any]) -> Dict[str, Any]:
+ default_key = task["sha1hex"]
+ return dict(
+ status="error-timeout",
+ error_msg="internal pdf-extract worker timeout",
+ source=task,
+ sha1hex=default_key,
+ )
+
+ def process(self, record: Any, key: Optional[str] = None) -> dict:
+ fetch_result = self.fetch_blob(record)
+ if fetch_result["status"] != "success":
+ return fetch_result
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
+
+ result = process_pdf(blob)
+ result.source = record
+ if self.thumbnail_sink and result.page0_thumbnail is not None:
+ self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
+ return result.to_pdftext_dict()
+
+
+class PdfExtractBlobWorker(SandcrawlerWorker):
+ """
+ This is sort of like PdfExtractWorker, except it receives blobs directly,
+ instead of fetching blobs from some remote store.
+ """
+
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
+ super().__init__()
+ self.sink = sink
+ self.thumbnail_sink = kwargs.get("thumbnail_sink")
+
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
+ if not blob:
+ return None
+ assert isinstance(blob, bytes)
+
+ result = process_pdf(blob)
+ if self.thumbnail_sink and result.page0_thumbnail is not None:
+ self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
+
+ return result.to_pdftext_dict()
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
new file mode 100644
index 0000000..112df6a
--- /dev/null
+++ b/python/sandcrawler/pdftrio.py
@@ -0,0 +1,142 @@
+import time
+from typing import Any, Dict, Optional
+
+import requests
+
+from .ia import WaybackClient
+from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
+
+class PdfTrioClient(object):
+ def __init__(self, host_url: str = "http://pdftrio.qa.fatcat.wiki", **kwargs):
+ self.host_url = host_url
+ self.http_session = requests_retry_session(retries=3, backoff_factor=3)
+
+ def classify_pdf(self, blob: bytes, mode: str = "auto") -> Dict[str, Any]:
+ """
+ Returns a dict with at least:
+
+ - status_code (int, always set)
+ - status (success, or error-*)
+
+ On success, the other remote API JSON response keys are also included.
+
+ On HTTP-level failures, the status_code and status fields are set
+ appropriately; an optional `error_msg` may also be set. Timeouts and
+ connection failures return error dicts (with negative status_code values);
+ other unexpected errors raise an exception.
+ """
+ assert blob and type(blob) == bytes
+
+ try:
+ pdftrio_response = self.http_session.post(
+ self.host_url + "/classify/research-pub/" + mode,
+ files={
+ "pdf_content": blob,
+ },
+ timeout=60.0,
+ )
+ except requests.Timeout:
+ return {
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "pdftrio request (HTTP POST) timeout",
+ }
+ except requests.exceptions.ConnectionError:
+ # crude back-off
+ time.sleep(2.0)
+ return {
+ "status": "error-connect",
+ "status_code": -2, # heritrix3 "HTTP connect" code
+ "error_msg": "pdftrio request connection timeout",
+ }
+
+ info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
+ if pdftrio_response.status_code == 200:
+ resp_json = pdftrio_response.json()
+ assert "ensemble_score" in resp_json
+ assert "status" in resp_json
+ assert "versions" in resp_json
+ info.update(resp_json)
+ else:
+ info["status"] = "error"
+ # TODO: might return JSON with some info?
+
+ info["_total_sec"] = pdftrio_response.elapsed.total_seconds()
+ return info
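+
+ # Illustrative success response shape (field values are hypothetical):
+ #   {"status_code": 200, "status": "success", "ensemble_score": 0.93,
+ #    "versions": {...}, "_total_sec": 1.2}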
+
+
+class PdfTrioWorker(SandcrawlerFetchWorker):
+ """
+ This class is basically copied directly from GrobidWorker
+ """
+
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs
+ ):
+ super().__init__(wayback_client=wayback_client, **kwargs)
+ self.pdftrio_client = pdftrio_client
+ self.sink = sink
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ start_process = time.time()
+ fetch_sec = None
+
+ start = time.time()
+ fetch_result = self.fetch_blob(record)
+ fetch_sec = time.time() - start
+ if fetch_result["status"] != "success":
+ return fetch_result
+ blob: bytes = fetch_result["blob"]
+ assert blob and isinstance(blob, bytes)
+
+ result = dict()
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob)
+ result["source"] = record
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
+ total_sec=time.time() - start_process,
+ )
+ if fetch_sec:
+ result["timing"]["fetch_sec"] = fetch_sec
+ return result
+
+
+class PdfTrioBlobWorker(SandcrawlerWorker):
+ """
+ This is sort of like PdfTrioWorker, except it receives blobs directly,
+ instead of fetching blobs from some remote store.
+ """
+
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ sink: Optional[SandcrawlerWorker] = None,
+ mode: str = "auto",
+ **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.pdftrio_client = pdftrio_client
+ self.sink = sink
+ self.mode = mode
+
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
+ start_process = time.time()
+ if not blob:
+ return None
+ assert isinstance(blob, bytes)
+ result = dict()
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
+ total_sec=time.time() - start_process,
+ )
+ return result
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
new file mode 100644
index 0000000..f682572
--- /dev/null
+++ b/python/sandcrawler/persist.py
@@ -0,0 +1,785 @@
+"""
+cdx
+- read raw CDX, filter
+- push to SQL table
+
+ingest-file-result
+- read JSON format (batch)
+- cdx SQL push batch (on conflict skip)
+- file_meta SQL push batch (on conflict update)
+- ingest request push batch (on conflict skip)
+- ingest result push batch (on conflict update)
+
+grobid
+- reads JSON format (batch)
+- grobid2json
+- minio push (one-by-one)
+- grobid SQL push batch (on conflict update)
+- file_meta SQL push batch (on conflict update)
+"""
+
+import os
+import time
+import xml.etree.ElementTree
+from typing import Any, Dict, List, Optional
+
+import psycopg2
+import requests
+
+from sandcrawler.db import SandcrawlerPostgresClient
+from sandcrawler.grobid import GrobidClient
+from sandcrawler.ingest_html import HtmlMetaRow
+from sandcrawler.minio import SandcrawlerMinioClient
+from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.workers import SandcrawlerWorker
+
+
+class PersistCdxWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__()
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+ # filter to full CDX lines, no liveweb
+ cdx_batch = [r for r in batch if r.get("warc_path") and ("/" in r["warc_path"])]
+ resp = self.db.insert_cdx(self.cur, cdx_batch)
+ if len(cdx_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(cdx_batch)
+ self.counts["insert-cdx"] += resp[0]
+ self.counts["update-cdx"] += resp[1]
+ self.db.commit()
+ return []
+
+
+class PersistIngestFileResultWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__()
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def request_to_row(self, raw: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+ """
+ Converts ingest-request JSON schema (eg, from Kafka) to SQL ingest_request schema
+
+ if there is a problem with conversion, return None
+ """
+ # backwards compat hacks; transform request to look like current schema
+ if raw.get("ingest_type") == "file":
+ raw["ingest_type"] = "pdf"
+ if (
+ not raw.get("link_source")
+ and raw.get("base_url")
+ and raw.get("ext_ids", {}).get("doi")
+ and raw["base_url"] == "https://doi.org/{}".format(raw["ext_ids"]["doi"])
+ ):
+ # set link_source(_id) for old ingest requests
+ raw["link_source"] = "doi"
+ raw["link_source_id"] = raw["ext_ids"]["doi"]
+ if (
+ not raw.get("link_source")
+ and raw.get("ingest_request_source", "").startswith("savepapernow")
+ and raw.get("fatcat", {}).get("release_ident")
+ ):
+ # set link_source(_id) for old ingest requests
+ raw["link_source"] = "spn"
+ raw["link_source_id"] = raw["fatcat"]["release_ident"]
+
+ for k in ("ingest_type", "base_url", "link_source", "link_source_id"):
+ if k not in raw:
+ self.counts["skip-request-fields"] += 1
+ return None
+ if raw["ingest_type"] not in ("pdf", "xml", "html"):
+ self.counts["skip-ingest-type"] += 1
+ return None
+ # limit on base_url length
+ if len(raw["base_url"]) > 1500:
+ self.counts["skip-url-too-long"] += 1
+ return None
+ request = {
+ "ingest_type": raw["ingest_type"],
+ "base_url": raw["base_url"],
+ "link_source": raw["link_source"],
+ "link_source_id": raw["link_source_id"],
+ "ingest_request_source": raw.get("ingest_request_source"),
+ "request": {},
+ }
+ # extra/optional fields
+ if raw.get("release_stage"):
+ request["release_stage"] = raw["release_stage"]
+ if raw.get("fatcat", {}).get("release_ident"):
+ request["request"]["release_ident"] = raw["fatcat"]["release_ident"]
+ for k in ("ext_ids", "edit_extra", "rel"):
+ if raw.get(k):
+ request["request"][k] = raw[k]
+ # if this dict is empty, trim it to save DB space
+ if not request["request"]:
+ request["request"] = None
+ return request
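+
+ # Illustrative request_to_row() mapping (field values are hypothetical):
+ #   input:  {"ingest_type": "pdf", "base_url": "https://doi.org/10.123/abc",
+ #            "link_source": "doi", "link_source_id": "10.123/abc"}
+ #   output: a row dict matching the ingest_request SQL schema, with optional
+ #           fields nested under "request"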
+
+ def file_result_to_row(self, raw: dict) -> Optional[dict]:
+ """
+ Converts ingest-result JSON schema (eg, from Kafka) to SQL ingest_file_result schema
+
+ if there is a problem with conversion, return None and set skip count
+ """
+ for k in ("request", "hit", "status"):
+ if k not in raw:
+ self.counts["skip-result-fields"] += 1
+ return None
+ if "base_url" not in raw["request"]:
+ self.counts["skip-result-fields"] += 1
+ return None
+ ingest_type = raw["request"].get("ingest_type")
+ if ingest_type == "file":
+ ingest_type = "pdf"
+ if ingest_type not in (
+ "pdf",
+ "xml",
+ "html",
+ "component",
+ "src",
+ "dataset",
+ "dataset-file",
+ ):
+ self.counts["skip-ingest-type"] += 1
+ return None
+ if raw["status"] in ("existing",):
+ self.counts["skip-existing"] += 1
+ return None
+ result = {
+ "ingest_type": ingest_type,
+ "base_url": raw["request"]["base_url"],
+ "hit": raw["hit"],
+ "status": raw["status"],
+ }
+ terminal = raw.get("terminal")
+ if terminal:
+ result["terminal_url"] = terminal.get("terminal_url") or terminal.get("url")
+ result["terminal_dt"] = terminal.get("terminal_dt")
+ result["terminal_status_code"] = (
+ terminal.get("terminal_status_code")
+ or terminal.get("status_code")
+ or terminal.get("http_code")
+ )
+ if result["terminal_status_code"]:
+ result["terminal_status_code"] = int(result["terminal_status_code"])
+ result["terminal_sha1hex"] = terminal.get("terminal_sha1hex")
+ if len(result["terminal_url"]) > 2048:
+ # postgresql13 doesn't like extremely large URLs in b-tree index
+ self.counts["skip-huge-url"] += 1
+ return None
+ return result
+
+ def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
+ html_body = record.get("html_body")
+ file_meta = record.get("file_meta")
+ if not (file_meta and html_body):
+ return None
+ return HtmlMetaRow(
+ sha1hex=file_meta["sha1hex"],
+ status=record.get("status"),
+ scope=record.get("scope"),
+ has_teixml=bool(html_body and html_body["status"] == "success"),
+ has_thumbnail=False, # TODO
+ word_count=(html_body and html_body.get("word_count")) or None,
+ biblio=record.get("html_biblio"),
+ resources=record.get("html_resources"),
+ )
+
+ def result_to_platform_row(self, raw: dict) -> Optional[dict]:
+ """
+ Converts fileset ingest-result JSON schema (eg, from Kafka) to SQL ingest_fileset_platform schema
+
+ if there is a problem with conversion, return None and set skip count
+ """
+ for k in ("request", "hit", "status"):
+ if k not in raw:
+ return None
+ if "base_url" not in raw["request"]:
+ return None
+ ingest_type = raw["request"].get("ingest_type")
+ if ingest_type not in ("dataset",):
+ return None
+ if raw["status"] in ("existing",):
+ return None
+ if not raw.get("platform_name"):
+ return None
+ result = {
+ "ingest_type": ingest_type,
+ "base_url": raw["request"]["base_url"],
+ "hit": raw["hit"],
+ "status": raw["status"],
+ "platform_name": raw.get("platform_name"),
+ "platform_domain": raw.get("platform_domain"),
+ "platform_id": raw.get("platform_id"),
+ "ingest_strategy": raw.get("ingest_strategy"),
+ "total_size": raw.get("total_size"),
+ "file_count": raw.get("file_count"),
+ "archiveorg_item_name": raw.get("archiveorg_item_name"),
+ "archiveorg_item_bundle_path": None,
+ "web_bundle_url": None,
+ "web_bundle_dt": None,
+ "manifest": raw.get("manifest"),
+ }
+ if result.get("fileset_bundle"):
+ result["archiveorg_item_bundle_path"] = result["fileset_bundle"].get(
+ "archiveorg_item_bundle_path"
+ )
+ result["web_bundle_url"] = (
+ result["fileset_bundle"].get("terminal", {}).get("terminal_url")
+ )
+ result["web_bundle_dt"] = (
+ result["fileset_bundle"].get("terminal", {}).get("terminal_dt")
+ )
+ return result
+
+ def push_batch(self, batch: List[Any]) -> List[Any]:
+ self.counts["total"] += len(batch)
+
+ if not batch:
+ return []
+
+ results_unfiltered = [self.file_result_to_row(raw) for raw in batch]
+ results = [r for r in results_unfiltered if r]
+
+ irequests_unfiltered = [
+ self.request_to_row(raw["request"]) for raw in batch if raw.get("request")
+ ]
+ irequests = [
+ r for r in irequests_unfiltered if r and r["ingest_type"] != "dataset-file"
+ ]
+
+ if irequests:
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ self.counts["insert-requests"] += resp[0]
+ self.counts["update-requests"] += resp[1]
+ if results:
+ resp = self.db.insert_ingest_file_result(self.cur, results, on_conflict="update")
+ self.counts["insert-results"] += resp[0]
+ self.counts["update-results"] += resp[1]
+
+ # these schemas match, so can just pass through
+ cdx_batch = [r["cdx"] for r in batch if r.get("hit") and r.get("cdx")]
+ revisit_cdx_batch = [
+ r["revisit_cdx"] for r in batch if r.get("hit") and r.get("revisit_cdx")
+ ]
+ cdx_batch.extend(revisit_cdx_batch)
+ # filter to full CDX lines, with full warc_paths (not liveweb)
+ cdx_batch = [r for r in cdx_batch if r.get("warc_path") and ("/" in r["warc_path"])]
+ if cdx_batch:
+ resp = self.db.insert_cdx(self.cur, cdx_batch)
+ self.counts["insert-cdx"] += resp[0]
+ self.counts["update-cdx"] += resp[1]
+
+ file_meta_batch = [r["file_meta"] for r in batch if r.get("hit") and r.get("file_meta")]
+ if file_meta_batch:
+ resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="nothing")
+ self.counts["insert-file_meta"] += resp[0]
+ self.counts["update-file_meta"] += resp[1]
+
+ html_meta_batch = [
+ self.result_to_html_meta(r) for r in batch if r.get("hit") and r.get("html_body")
+ ]
+ if html_meta_batch:
+ rows = [d.to_sql_tuple() for d in html_meta_batch if d]
+ resp = self.db.insert_html_meta(self.cur, rows, on_conflict="update")
+ self.counts["insert-html_meta"] += resp[0]
+ self.counts["update-html_meta"] += resp[1]
+
+ fileset_platform_batch_all = [
+ self.result_to_platform_row(raw)
+ for raw in batch
+ if raw.get("request", {}).get("ingest_type") == "dataset"
+ and raw.get("platform_name")
+ ]
+ fileset_platform_batch: List[Dict] = [p for p in fileset_platform_batch_all if p]
+ if fileset_platform_batch:
+ resp = self.db.insert_ingest_fileset_platform(
+ self.cur, fileset_platform_batch, on_conflict="update"
+ )
+ self.counts["insert-fileset_platform"] += resp[0]
+ self.counts["update-fileset_platform"] += resp[1]
+
+ self.db.commit()
+ return []
+
+
+class PersistIngestFilesetWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__()
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+
+class PersistIngestRequestWorker(PersistIngestFileResultWorker):
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__(db_url=db_url)
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ if not batch:
+ return []
+
+ irequests_all = [self.request_to_row(raw) for raw in batch]
+ irequests: List[Dict] = [r for r in irequests_all if r]
+
+ if irequests:
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ self.counts["insert-requests"] += resp[0]
+ self.counts["update-requests"] += resp[1]
+
+ self.db.commit()
+ return []
+
+
+class PersistGrobidWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__()
+ self.grobid = GrobidClient()
+ self.s3 = SandcrawlerMinioClient(
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
+ )
+ self.s3_only = kwargs.get("s3_only", False)
+ self.db_only = kwargs.get("db_only", False)
+ assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
+ if not self.s3_only:
+ self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
+ else:
+ self.db = None
+ self.cur = None
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ # filter out bad "missing status_code" timeout rows
+ missing = [r for r in batch if not r.get("status_code")]
+ if missing:
+ self.counts["skip-missing-status"] += len(missing)
+ batch = [r for r in batch if r.get("status_code")]
+
+ for r in batch:
+ if r["status_code"] != 200 or not r.get("tei_xml"):
+ self.counts["s3-skip-status"] += 1
+ if r.get("error_msg"):
+ r["metadata"] = {"error_msg": r["error_msg"][:500]}
+ continue
+
+ assert len(r["key"]) == 40
+ if not self.db_only:
+ self.s3.put_blob(
+ folder="grobid",
+ blob=r["tei_xml"],
+ sha1hex=r["key"],
+ extension=".tei.xml",
+ )
+ self.counts["s3-put"] += 1
+
+ # enhance with GROBID TEI-XML metadata, if available
+ try:
+ metadata = self.grobid.metadata(r)
+ except xml.etree.ElementTree.ParseError as xml_e:
+ r["status"] = "bad-grobid-xml"
+ r["metadata"] = {"error_msg": str(xml_e)[:1024]}
+ continue
+ if not metadata:
+ continue
+ for k in ("fatcat_release", "grobid_version"):
+ r[k] = metadata.pop(k, None)
+ if r.get("fatcat_release"):
+ r["fatcat_release"] = r["fatcat_release"].replace("release_", "")
+ if metadata.get("grobid_timestamp"):
+ r["updated"] = metadata["grobid_timestamp"]
+ r["metadata"] = metadata
+
+ if not self.s3_only:
+ assert self.db and self.cur
+ resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
+ self.counts["insert-grobid"] += resp[0]
+ self.counts["update-grobid"] += resp[1]
+
+ file_meta_batch = [r["file_meta"] for r in batch if r.get("file_meta")]
+ resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
+
+ self.db.commit()
+
+ return []
+
+
+class PersistGrobidDiskWorker(SandcrawlerWorker):
+ """
+ Writes blobs out to disk.
+
+ This could be refactored into a "Sink" type with an even thinner wrapper.
+ """
+
+ def __init__(self, output_dir: str):
+ super().__init__()
+ self.output_dir = output_dir
+
+ def _blob_path(self, sha1hex: str, extension: str = ".tei.xml") -> str:
+ obj_path = "{}/{}/{}{}".format(
+ sha1hex[0:2],
+ sha1hex[2:4],
+ sha1hex,
+ extension,
+ )
+ return obj_path
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+
+ if record.get("status_code") != 200 or not record.get("tei_xml"):
+ return False
+ assert len(record["key"]) == 40
+ p = "{}/{}".format(self.output_dir, self._blob_path(record["key"]))
+ os.makedirs(os.path.dirname(p), exist_ok=True)
+ with open(p, "w") as f:
+ f.write(record.pop("tei_xml"))
+ self.counts["written"] += 1
+ return record
+
+
+class PersistPdfTrioWorker(SandcrawlerWorker):
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__()
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ batch = [r for r in batch if "pdf_trio" in r and r["pdf_trio"].get("status_code")]
+ for r in batch:
+ # copy key (sha1hex) into sub-object
+ r["pdf_trio"]["key"] = r["key"]
+ pdftrio_batch = [r["pdf_trio"] for r in batch]
+ resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update")
+ self.counts["insert-pdftrio"] += resp[0]
+ self.counts["update-pdftrio"] += resp[1]
+
+ file_meta_batch = [
+ r["file_meta"]
+ for r in batch
+ if r["pdf_trio"]["status"] == "success" and r.get("file_meta")
+ ]
+ resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
+
+ self.db.commit()
+ return []
+
+
+class PersistPdfTextWorker(SandcrawlerWorker):
+ """
+ Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL table.
+
+ Should keep batch sizes small.
+ """
+
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__()
+ self.s3 = SandcrawlerMinioClient(
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
+ )
+ self.s3_only = kwargs.get("s3_only", False)
+ self.db_only = kwargs.get("db_only", False)
+ assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
+ if not self.s3_only:
+ self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
+ self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
+ else:
+ self.db = None
+ self.cur = None
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ parsed_batch = []
+ for r in batch:
+ parsed_batch.append(PdfExtractResult.from_pdftext_dict(r))
+
+ for r in parsed_batch:
+ if r.status != "success" or not r.text:
+ self.counts["s3-skip-status"] += 1
+ if r.error_msg:
+ r.metadata = {"error_msg": r.error_msg[:500]}
+ continue
+
+ assert len(r.sha1hex) == 40
+ if not self.db_only:
+ self.s3.put_blob(
+ folder="text",
+ blob=r.text,
+ sha1hex=r.sha1hex,
+ extension=".txt",
+ )
+ self.counts["s3-put"] += 1
+
+ if not self.s3_only:
+ assert self.db and self.cur
+ rows = [r.to_sql_tuple() for r in parsed_batch]
+ resp = self.db.insert_pdf_meta(self.cur, rows, on_conflict="update")
+ self.counts["insert-pdf-meta"] += resp[0]
+ self.counts["update-pdf-meta"] += resp[1]
+
+ file_meta_batch = [r.file_meta for r in parsed_batch if r.file_meta]
+ resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
+ self.counts["insert-file-meta"] += resp[0]
+ self.counts["update-file-meta"] += resp[1]
+
+ self.db.commit()
+
+ return []
+
+
+class PersistThumbnailWorker(SandcrawlerWorker):
+ """
+ Pushes page-zero thumbnail images (raw bytes) to blob store
+ (S3/seaweed/minio). Does not write to the SQL database.
+
+ This worker *must* be used with raw Kafka mode; thumbnails are *not*
+ wrapped in JSON like most sandcrawler kafka messages.
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.s3 = SandcrawlerMinioClient(
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
+ )
+ self.s3_extension = kwargs.get("s3_extension", ".jpg")
+ self.s3_folder = kwargs.get("s3_folder", "pdf")
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """
+ Processing raw messages, not decoded JSON objects
+ """
+
+ assert isinstance(record, bytes)
+ blob: bytes = record
+ if isinstance(key, bytes):
+ key = key.decode("utf-8")
+ assert key is not None and len(key) == 40 and isinstance(key, str)
+ assert len(blob) >= 50
+
+ self.s3.put_blob(
+ folder=self.s3_folder,
+ blob=blob,
+ sha1hex=key,
+ extension=self.s3_extension,
+ )
+ self.counts["s3-put"] += 1
+
+
+class GenericPersistDocWorker(SandcrawlerWorker):
+ """
+ Pushes blobs from Kafka to S3.
+
+ Objects are assumed to be JSON-wrapped strings.
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__()
+ self.s3 = SandcrawlerMinioClient(
+ host_url=kwargs.get("s3_url", "localhost:9000"),
+ access_key=kwargs["s3_access_key"],
+ secret_key=kwargs["s3_secret_key"],
+ default_bucket=kwargs["s3_bucket"],
+ )
+ self.s3_extension = kwargs.get("s3_extension", ".unknown")
+ self.s3_folder = kwargs.get("s3_folder", "unknown")
+ self.doc_key = "unknown"
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+
+ if record.get("status") != "success" or not record.get(self.doc_key):
+ return
+
+ assert key is not None
+ if isinstance(key, bytes):
+ key_str = key.decode("utf-8")
+ elif isinstance(key, str):
+ key_str = key
+ assert len(key_str) == 40
+ if "sha1hex" in record:
+ assert key_str == record["sha1hex"]
+
+ self.s3.put_blob(
+ folder=self.s3_folder,
+ blob=record[self.doc_key].encode("utf-8"),
+ sha1hex=key_str,
+ extension=self.s3_extension,
+ )
+ self.counts["s3-put"] += 1
+
+
+class PersistXmlDocWorker(GenericPersistDocWorker):
+ """
+ Pushes TEI-XML file to blob store (S3/seaweed/minio). Does not talk to
+ sandcrawler database (SQL).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.s3_extension = kwargs.get("s3_extension", ".jats.xml")
+ self.s3_folder = kwargs.get("s3_folder", "xml_doc")
+ self.doc_key = "jats_xml"
+
+
+class PersistHtmlTeiXmlWorker(GenericPersistDocWorker):
+ """
+ Pushes TEI-XML file to blob store (S3/seaweed/minio). Does not talk to
+ sandcrawler database (SQL).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.s3_extension = kwargs.get("s3_extension", ".tei.xml")
+ self.s3_folder = kwargs.get("s3_folder", "html_body")
+ self.doc_key = "tei_xml"
+
+
+class PersistCrossrefWorker(SandcrawlerWorker):
+ """
+ Pushes Crossref API JSON records into postgresql. Can also talk to GROBID
+ to parse 'unstructured' references, pushing the results into postgresql at
+ the same time.
+ """
+
+ def __init__(
+ self,
+ db_url: str,
+ grobid_client: Optional[GrobidClient],
+ parse_refs: bool = True,
+ **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+ if grobid_client:
+ self.grobid_client = grobid_client
+ else:
+ self.grobid_client = GrobidClient()
+ self.parse_refs = parse_refs
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ crossref_batch = []
+ refs_batch = []
+ for record in batch:
+ crossref_batch.append(
+ dict(
+ doi=record["DOI"].lower().strip(),
+ indexed=record["indexed"]["date-time"],
+ record=record,
+ )
+ )
+ if self.parse_refs:
+ try:
+ parsed_refs = self.grobid_client.crossref_refs(record)
+ refs_batch.append(parsed_refs)
+ except (
+ xml.etree.ElementTree.ParseError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.ReadTimeout,
+ ):
+ print("GROBID crossref refs parsing error, skipping with a sleep")
+ time.sleep(3)
+ pass
+
+ resp = self.db.insert_crossref(self.cur, crossref_batch)
+ if len(crossref_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(crossref_batch)
+ self.counts["insert-crossref"] += resp[0]
+ self.counts["update-crossref"] += resp[1]
+
+ if refs_batch:
+ resp = self.db.insert_grobid_refs(self.cur, refs_batch)
+ if len(refs_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(refs_batch)
+ self.counts["insert-grobid_refs"] += resp[0]
+ self.counts["update-grobid_refs"] += resp[1]
+
+ self.db.commit()
+ return []
+
+
+class PersistGrobidRefsWorker(SandcrawlerWorker):
+ """
+ Simple persist worker to backfill GROBID references into postgresql
+ locally. Consumes the JSON output from GROBID CrossrefRefsWorker.
+ """
+
+ def __init__(self, db_url: str, **kwargs):
+ super().__init__(**kwargs)
+ self.db = SandcrawlerPostgresClient(db_url)
+ self.cur = self.db.conn.cursor()
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ """Only do batches (as transactions)"""
+ raise NotImplementedError
+
+ def push_batch(self, batch: list) -> list:
+ self.counts["total"] += len(batch)
+
+ refs_batch = []
+ for record in batch:
+ assert record["source"]
+ assert record["source_id"]
+ refs_batch.append(record)
+
+ resp = self.db.insert_grobid_refs(self.cur, refs_batch)
+ if len(refs_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(refs_batch)
+ self.counts["insert-grobid_refs"] += resp[0]
+ self.counts["update-grobid_refs"] += resp[1]
+
+ self.db.commit()
+ return []
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 3b46cb7..356f050 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -1,12 +1,22 @@
-
-import sys
import json
-import zipfile
import multiprocessing.pool
+import signal
+import sys
+import time
+import zipfile
from collections import Counter
-from confluent_kafka import Consumer, Producer, KafkaException
+from typing import Any, Dict, List, Optional, Sequence
+
+from confluent_kafka import Consumer, KafkaException, Producer
-from .misc import parse_cdx_line
+from .ia import (
+ PetaboxError,
+ SandcrawlerBackoffError,
+ WaybackClient,
+ WaybackContentError,
+ WaybackError,
+)
+from .misc import parse_cdx_line, requests_retry_session
class SandcrawlerWorker(object):
@@ -17,72 +27,226 @@ class SandcrawlerWorker(object):
worker (pipeline-style), or defaults to stdout.
"""
- def __init__(self):
- self.counts = Counter()
- self.sink = None
- # TODO: self.counters
+ def __init__(self, sink: Optional["SandcrawlerWorker"] = None):
+ self.counts: Counter = Counter()
+ self.sink: Optional[SandcrawlerWorker] = sink
- def push_record(self, task):
- self.counts['total'] += 1
- result = self.process(task)
+ def push_record(self, task: Any, key: Optional[str] = None) -> Any:
+ self.counts["total"] += 1
+ if not self.want(task):
+ self.counts["skip"] += 1
+ return
+ result = self.process(task, key=key)
if not result:
- self.counts['failed'] += 1
+ self.counts["failed"] += 1
return
- elif type(result) == dict and 'status' in result and len(result['status']) < 32:
- self.counts[result['status']] += 1
+ elif type(result) == dict and "status" in result and len(result["status"]) < 32:
+ self.counts[result["status"]] += 1
if self.sink:
self.sink.push_record(result)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
else:
print(json.dumps(result))
return result
- def push_batch(self, tasks):
+ def timeout_response(self, task: Any) -> Any:
+ """
+ This should be overridden by workers that want to return something
+ meaningful when there is a processing timeout. Eg, JSON vs some other
+ error message.
+ """
+ return None
+
+ def push_record_timeout(
+ self, task: Any, key: Optional[str] = None, timeout: int = 300
+ ) -> Any:
+ """
+ A wrapper around self.push_record which sets a timeout.
+
+ Note that this uses signals and *will behave wrong/weirdly* with
+ multithreading or if signal-based timeouts are used elsewhere in the
+ same process.
+ """
+
+ def timeout_handler(signum: int, frame: Any) -> None:
+ raise TimeoutError("timeout processing record")
+
+ signal.signal(signal.SIGALRM, timeout_handler)
+ resp = None
+ signal.alarm(int(timeout))
+ try:
+ resp = self.push_record(task, key=key)
+ except TimeoutError:
+ self.counts["timeout"] += 1
+ resp = self.timeout_response(task) # pylint: disable=assignment-from-none
+ # TODO: what if it is this push_record() itself that is timing out?
+ if resp and self.sink:
+ self.sink.push_record(resp)
+ self.counts["pushed"] += 1
+ elif resp:
+ print(json.dumps(resp))
+ finally:
+ signal.alarm(0)
+ return resp
+
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
results = []
for task in tasks:
results.append(self.push_record(task))
return results
- def finish(self):
+ def finish(self) -> Counter:
if self.sink:
self.sink.finish()
- sys.stderr.write("Worker: {}\n".format(self.counts))
+ print("Worker: {}".format(self.counts), file=sys.stderr)
return self.counts
-class MultiprocessWrapper(SandcrawlerWorker):
+ def want(self, task: Any) -> bool:
+ """
+ Optionally override this as a filter in implementations.
+ """
+ return True
+
+ def process(self, task: Any, key: Optional[str] = None) -> Any:
+ """
+ Derived workers need to implement business logic here.
+
+ TODO: should derived workers explicitly type-check the 'task' object?
+ """
+ raise NotImplementedError("implementation required")
+
+
+class SandcrawlerFetchWorker(SandcrawlerWorker):
+ """
+ Wrapper of SandcrawlerWorker that adds a helper method to fetch blobs (eg,
+ PDFs) from wayback, archive.org, or other sources.
+ """
+
+ def __init__(self, wayback_client: Optional[WaybackClient], **kwargs):
+ super().__init__(**kwargs)
+ self.wayback_client = wayback_client
+ self.http_session = requests_retry_session()
+
+ def fetch_blob(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ default_key = record["sha1hex"]
+ wayback_sec = None
+ petabox_sec = None
+
+ if record.get("warc_path") and record.get("warc_offset"):
+ # it's a full CDX dict. fetch using WaybackClient
+ if not self.wayback_client:
+ raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
+ try:
+ start = time.time()
+ blob: bytes = self.wayback_client.fetch_petabox_body(
+ csize=record["warc_csize"],
+ offset=record["warc_offset"],
+ warc_path=record["warc_path"],
+ )
+ wayback_sec = time.time() - start
+ except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we:
+ return dict(
+ key=default_key,
+ source=record,
+ status="error-wayback",
+ error_msg=str(we),
+ )
+ elif record.get("url") and record.get("datetime"):
+ # it's a partial CDX dict (url + datetime); fetch a replay body using WaybackClient
+ if not self.wayback_client:
+ raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
+ try:
+ start = time.time()
+ blob = self.wayback_client.fetch_replay_body(
+ url=record["url"],
+ datetime=record["datetime"],
+ )
+ wayback_sec = time.time() - start
+ except (WaybackError, WaybackContentError) as we:
+ return dict(
+ key=default_key,
+ source=record,
+ status="error-wayback",
+ error_msg=str(we),
+ )
+ elif record.get("item") and record.get("path"):
+ # it's a petabox (archive.org) link; fetch via HTTP
+ start = time.time()
+ ia_resp = self.http_session.get(
+ "https://archive.org/serve/{}/{}".format(record["item"], record["path"])
+ )
+ petabox_sec = time.time() - start
+ try:
+ ia_resp.raise_for_status()
+ except Exception as e:
+ return dict(
+ key=default_key,
+ source=record,
+ status="error-petabox",
+ error_msg=str(e),
+ )
+ blob = ia_resp.content
+ else:
+ raise ValueError(
+ "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed"
+ )
+ if not blob:
+ return dict(
+ key=default_key,
+ source=record,
+ status="empty-blob",
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
+ )
+ return dict(
+ key=default_key,
+ status="success",
+ source=record,
+ blob=blob,
+ wayback_sec=wayback_sec,
+ petabox_sec=petabox_sec,
+ )
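+
+ # Illustrative record shapes accepted by fetch_blob() (values are hypothetical):
+ #   full CDX:   {"sha1hex": ..., "warc_path": "X.warc.gz", "warc_offset": 123, "warc_csize": 456}
+ #   replay:     {"sha1hex": ..., "url": "http://example.com/a.pdf", "datetime": "20200101000000"}
+ #   petabox:    {"sha1hex": ..., "item": "example_item", "path": "example.pdf"}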
- def __init__(self, worker, sink, jobs=None):
+
+class MultiprocessWrapper(SandcrawlerWorker):
+ def __init__(
+ self,
+ worker: SandcrawlerWorker,
+ sink: Optional[SandcrawlerWorker] = None,
+ jobs: Optional[int] = None,
+ ):
self.counts = Counter()
self.worker = worker
self.sink = sink
self.pool = multiprocessing.pool.Pool(jobs)
- def push_batch(self, tasks):
- self.counts['total'] += len(tasks)
- sys.stderr.write("... processing batch of: {}\n".format(len(tasks)))
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
+ self.counts["total"] += len(tasks)
+ print("... processing batch of: {}".format(len(tasks)), file=sys.stderr)
results = self.pool.map(self.worker.process, tasks)
for result in results:
if not result:
- self.counts['failed'] += 1
- return
- elif type(result) == dict and 'status' in result and len(result['status']) < 32:
- self.counts[result['status']] += 1
+ self.counts["failed"] += 1
+ return []
+ elif type(result) == dict and "status" in result and len(result["status"]) < 32:
+ self.counts[result["status"]] += 1
if self.sink:
self.sink.push_record(result)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
else:
print(json.dumps(result))
return results
- def finish(self):
+ def finish(self) -> Counter:
self.pool.terminate()
if self.sink:
self.sink.finish()
- worker_counts = self.worker.finish()
- sys.stderr.write("Multiprocessing: {}\n".format(self.counts))
- return worker_counts
+ self.worker.finish()
+ print("Multiprocessing: {}".format(self.counts), file=sys.stderr)
+ return self.counts
+
class BlackholeSink(SandcrawlerWorker):
"""
@@ -91,91 +255,97 @@ class BlackholeSink(SandcrawlerWorker):
Useful for tests.
"""
- def push_record(self, task):
+ def push_record(self, task: Any, key: Optional[str] = None) -> Any:
return
- def push_batch(self, tasks):
- return
+ def push_batch(self, tasks: List[Any]) -> List[Any]:
+ return []
-class KafkaSink(SandcrawlerWorker):
- def __init__(self, kafka_hosts, produce_topic, **kwargs):
+class KafkaSink(SandcrawlerWorker):
+ def __init__(self, kafka_hosts: str, produce_topic: str, **kwargs):
self.sink = None
self.counts = Counter()
self.produce_topic = produce_topic
self.kafka_hosts = kafka_hosts
- config = self.producer_config({
- 'bootstrap.servers': kafka_hosts,
- 'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes
- })
+ config = self.producer_config(
+ {
+ "bootstrap.servers": kafka_hosts,
+ "message.max.bytes": 30000000, # ~30 MBytes; broker is ~50 MBytes
+ "api.version.request": True,
+ "api.version.fallback.ms": 0,
+ }
+ )
self.producer = Producer(config)
-
@staticmethod
- def _fail_fast(err, msg):
+ def _fail_fast(err: Any, msg: Any) -> None:
if err is not None:
- sys.stderr.write("Kafka producer delivery error: {}\n".format(err))
- sys.stderr.write("Bailing out...\n")
+ print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
+ print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
- def producer_config(self, kafka_config):
+ def producer_config(self, kafka_config: dict) -> dict:
config = kafka_config.copy()
- config.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'request.required.acks': -1, # all brokers must confirm
+ config.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "message.timeout.ms": 30000,
+ "request.required.acks": -1, # all brokers must confirm
+ },
}
- })
+ )
return config
- def push_record(self, msg, key=None):
- self.counts['total'] += 1
+ def push_record(self, msg: Any, key: Optional[str] = None) -> Any:
+ self.counts["total"] += 1
if type(msg) == dict:
- if not key and 'key' in msg:
- key = msg['key']
+ if not key and "key" in msg:
+ key = msg["key"]
msg = json.dumps(msg)
if type(msg) == str:
- msg = msg.encode('utf-8')
+ msg = msg.encode("utf-8")
assert type(msg) == bytes
- self.producer.produce(
- self.produce_topic,
- msg,
- key=key,
- on_delivery=self._fail_fast)
- self.counts['produced'] += 1
+ self.producer.produce(self.produce_topic, msg, key=key, on_delivery=self._fail_fast)
+ self.counts["produced"] += 1
- # TODO: check for errors etc. is this necessary?
+ # check for errors etc
self.producer.poll(0)
- def push_batch(self, msgs):
+ def push_batch(self, msgs: List[Any]) -> List[Any]:
for m in msgs:
self.push_record(m)
+ return []
- def finish(self):
+ def finish(self) -> Counter:
self.producer.flush()
return self.counts
-class KafkaGrobidSink(KafkaSink):
+class KafkaCompressSink(KafkaSink):
"""
Variant of KafkaSink for large documents. Used for, eg, GROBID output.
"""
- def producer_config(self, kafka_config):
+ def producer_config(self, kafka_config: Dict[str, Any]) -> Dict[str, Any]:
config = kafka_config.copy()
- config.update({
- 'compression.codec': 'gzip',
- 'retry.backoff.ms': 250,
- 'linger.ms': 5000,
- 'batch.num.messages': 50,
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'request.required.acks': -1, # all brokers must confirm
+ config.update(
+ {
+ "compression.codec": "gzip",
+ "retry.backoff.ms": 250,
+ "linger.ms": 1000,
+ "batch.num.messages": 50,
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "message.timeout.ms": 30000,
+ "request.required.acks": -1, # all brokers must confirm
+ },
}
- })
+ )
return config
@@ -185,11 +355,11 @@ class RecordPusher:
trivial interface, just wraps an importer and pushes records in to it.
"""
- def __init__(self, worker, **kwargs):
- self.counts = Counter()
- self.worker = worker
+ def __init__(self, worker: SandcrawlerWorker, **kwargs):
+ self.counts: Counter = Counter()
+ self.worker: SandcrawlerWorker = worker
- def run(self):
+ def run(self) -> Counter:
"""
This will look something like:
@@ -202,142 +372,173 @@ class RecordPusher:
class JsonLinePusher(RecordPusher):
-
- def __init__(self, worker, json_file, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, json_file: Sequence, **kwargs):
self.counts = Counter()
self.worker = worker
self.json_file = json_file
- self.batch_size = kwargs.get('batch_size', None)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
for line in self.json_file:
if not line:
continue
- self.counts['total'] += 1
- record = json.loads(line)
+ self.counts["total"] += 1
+ try:
+ record = json.loads(line)
+ except json.decoder.JSONDecodeError:
+ self.counts["error-json-decode"] += 1
+ continue
if self.batch_size:
batch.append(record)
if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(record)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
- sys.stderr.write("JSON lines pushed: {}\n".format(self.counts))
+ self.worker.finish()
+ print("JSON lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
class CdxLinePusher(RecordPusher):
-
- def __init__(self, worker, cdx_file, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, cdx_file: Sequence, **kwargs):
self.counts = Counter()
self.worker = worker
self.cdx_file = cdx_file
- self.filter_http_statuses = kwargs.get('filter_http_statuses', None)
- self.filter_mimetypes = kwargs.get('filter_mimetypes', None)
- self.allow_octet_stream = kwargs.get('allow_octet_stream', False)
- self.batch_size = kwargs.get('batch_size', None)
+ self.filter_http_statuses = kwargs.get("filter_http_statuses", None)
+ self.filter_mimetypes = kwargs.get("filter_mimetypes", None)
+ self.allow_octet_stream = kwargs.get("allow_octet_stream", False)
+ self.batch_size = kwargs.get("batch_size", None)
if self.batch_size in (0, 1):
self.batch_size = None
- def run(self):
+ def run(self) -> Counter:
batch = []
for line in self.cdx_file:
if not line:
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
record = parse_cdx_line(line, normalize=True)
if not record:
- self.counts['skip-parse'] += 1
+ self.counts["skip-parse"] += 1
continue
- if self.filter_http_statuses and record['http_status'] not in self.filter_http_statuses:
- self.counts['skip-http_status'] += 1
+ if (
+ self.filter_http_statuses
+ and record["http_status"] not in self.filter_http_statuses
+ ):
+ self.counts["skip-http_status"] += 1
continue
- if self.filter_mimetypes and record['mimetype'] not in self.filter_mimetypes:
- self.counts['skip-mimetype'] += 1
+ if self.filter_mimetypes and record["mimetype"] not in self.filter_mimetypes:
+ self.counts["skip-mimetype"] += 1
continue
if self.batch_size:
batch.append(record)
- if len(batch) > self.batch_size:
+ if len(batch) >= self.batch_size:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
else:
self.worker.push_record(record)
- self.counts['pushed'] += 1
+ self.counts["pushed"] += 1
if self.batch_size and batch:
self.worker.push_batch(batch)
- self.counts['pushed'] += len(batch)
+ self.counts["pushed"] += len(batch)
batch = []
- worker_counts = self.worker.finish()
- sys.stderr.write("CDX lines pushed: {}\n".format(self.counts))
+ self.worker.finish()
+ print("CDX lines pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
class ZipfilePusher(RecordPusher):
-
- def __init__(self, worker, zipfile_path, **kwargs):
+ def __init__(self, worker: SandcrawlerWorker, zipfile_path: str, **kwargs):
self.counts = Counter()
self.worker = worker
self.filter_suffix = ".pdf"
self.zipfile_path = zipfile_path
+ self.batch_size = kwargs.get("batch_size", None)
+ if self.batch_size in (0, 1):
+ self.batch_size = None
- def run(self):
- with zipfile.ZipFile(self.zipfile_path, 'r') as archive:
+ def run(self) -> Counter:
+ batch = []
+ with zipfile.ZipFile(self.zipfile_path, "r") as archive:
for zipinfo in archive.infolist():
if not zipinfo.filename.endswith(self.filter_suffix):
continue
- self.counts['total'] += 1
+ self.counts["total"] += 1
# NB doesn't really extract the file, just gives you a stream (file-like-object) for reading it
- flo = archive.open(zipinfo, 'r')
+ flo = archive.open(zipinfo, "r")
data = flo.read(2**32)
flo.close()
- self.worker.push_record(data)
- self.counts['pushed'] += 1
- worker_counts = self.worker.finish()
- sys.stderr.write("ZIP PDFs pushed: {}\n".format(self.counts))
+ if self.batch_size:
+ batch.append(data)
+ if len(batch) >= self.batch_size:
+ self.worker.push_batch(batch)
+ self.counts["pushed"] += len(batch)
+ batch = []
+ else:
+ self.worker.push_record(data)
+ self.counts["pushed"] += 1
+ if self.batch_size and batch:
+ self.worker.push_batch(batch)
+ self.counts["pushed"] += len(batch)
+ batch = []
+ self.worker.finish()
+ print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr)
return self.counts
class KafkaJsonPusher(RecordPusher):
-
- def __init__(self, worker, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
+ def __init__(
+ self,
+ worker: SandcrawlerWorker,
+ kafka_hosts: str,
+ consume_topic: str,
+ group: str,
+ **kwargs
+ ):
self.counts = Counter()
self.worker = worker
self.consumer = make_kafka_consumer(
kafka_hosts,
- kafka_env,
- topic_suffix,
+ consume_topic,
group,
)
- self.poll_interval = kwargs.get('poll_interval', 5.0)
- self.batch_size = kwargs.get('batch_size', 100)
+ self.push_batches = kwargs.get("push_batches", False)
+ self.raw_records = kwargs.get("raw_records", False)
+ self.poll_interval = kwargs.get("poll_interval", 5.0)
+ self.batch_size = kwargs.get("batch_size", 100)
if self.batch_size in (0, 1):
self.batch_size = 1
- self.batch_worker = kwargs.get('batch_worker', False)
+ self.batch_worker = kwargs.get("batch_worker", False)
+ self.process_timeout_sec = kwargs.get("process_timeout_sec", 300)
- def run(self):
+ def run(self) -> Counter:
while True:
# TODO: this is batch-oriented, because underlying worker is
# often batch-oriented, but this doesn't confirm that entire batch
- # has been pushed to fatcat before commiting offset. Eg, consider
+ # has been pushed to fatcat before committing offset. Eg, consider
# case where there there is one update and thousands of creates;
# update would be lingering in worker, and if worker crashed
# never created. Not great.
batch = self.consumer.consume(
- num_messages=self.batch_size,
- timeout=self.poll_interval)
- sys.stderr.write("... got {} kafka messages ({}sec poll interval)\n".format(
- len(batch), self.poll_interval))
+ num_messages=self.batch_size, timeout=self.poll_interval
+ )
+ print(
+ "... got {} kafka messages ({}sec poll interval)".format(
+ len(batch), self.poll_interval
+ ),
+ file=sys.stderr,
+ )
if not batch:
# TODO: could have some larger timeout here and
# self.worker.finish() if it's been more than, eg, a couple
@@ -349,19 +550,48 @@ class KafkaJsonPusher(RecordPusher):
raise KafkaException(msg.error())
# ... then process
if self.push_batches:
- self.counts['total'] += len(batch)
- records = [json.loads(msg.value().decode('utf-8')) for msg in batch]
+ self.counts["total"] += len(batch)
+ records = [json.loads(msg.value().decode("utf-8")) for msg in batch]
self.worker.push_batch(records)
- self.counts['pushed'] += len(batch)
- sys.stderr.write("Import counts: {}\n".format(self.worker.counts))
+ self.counts["pushed"] += len(batch)
+ print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
else:
for msg in batch:
- self.counts['total'] += 1
- record = json.loads(msg.value().decode('utf-8'))
- self.worker.push_record(record)
- self.counts['pushed'] += 1
- if self.counts['total'] % 500 == 0:
- sys.stderr.write("Import counts: {}\n".format(self.worker.counts))
+ self.counts["total"] += 1
+ if self.raw_records:
+ # In this mode, pass the Kafka message as bytes through
+ # without decoding as JSON. Eg, for thumbnails (where
+ # message bytes are JPEG, and we need the sha1hex key
+ # from the message)
+ record = msg.value()
+ else:
+ record = json.loads(msg.value().decode("utf-8"))
+ # This complex bit of code implements backoff/backpressure
+ # in a way that will not cause this Kafka consumer to lose
+ # partition assignments (resulting in a rebalance). This
+ # was needed for the ingest workers. There is probably a
+ # better way to structure this concurrency.
+ done = False
+ while not done:
+ try:
+ # use timeouts; don't want kafka itself to timeout
+ self.worker.push_record_timeout(
+ record, key=msg.key(), timeout=self.process_timeout_sec
+ )
+ break
+ except SandcrawlerBackoffError as be:
+ print("Backing off for 200 seconds: {}".format(be))
+ self.consumer.pause(self.consumer.assignment())
+ for i in range(40):
+ # Beware this poll which should not be
+ # receiving any messages because we are paused!
+ empty_batch = self.consumer.poll(0)
+ assert not empty_batch
+ time.sleep(5)
+ self.consumer.resume(self.consumer.assignment())
+ self.counts["pushed"] += 1
+ if self.counts["total"] % 500 == 0:
+ print("Import counts: {}".format(self.worker.counts), file=sys.stderr)
for msg in batch:
# locally store offsets of processed messages; will be
# auto-commited by librdkafka from this "stored" value
@@ -369,63 +599,65 @@ class KafkaJsonPusher(RecordPusher):
# TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
# commit the current batch if it has been lingering
- worker_counts = self.worker.finish()
- sys.stderr.write("KafkaJson lines pushed: {}\n".format(self.counts))
+ self.worker.finish()
+ print("KafkaJson lines pushed: {}".format(self.counts), file=sys.stderr)
self.consumer.close()
return self.counts
-def make_kafka_consumer(hosts, env, topic_suffix, group):
- topic_name = "fatcat-{}.{}".format(env, topic_suffix)
+def make_kafka_consumer(hosts: str, consume_topic: str, group: str) -> Consumer:
+ topic_name = consume_topic
- def fail_fast(err, partitions):
+ def fail_fast(err: Any, partitions: List[Any]) -> None:
if err is not None:
- sys.stderr.write("Kafka consumer commit error: {}\n".format(err))
- sys.stderr.write("Bailing out...\n")
+ print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
+ print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
for p in partitions:
# check for partition-specific commit errors
if p.error:
- sys.stderr.write("Kafka consumer commit error: {}\n".format(p.error))
- sys.stderr.write("Bailing out...\n")
+ print("Kafka consumer commit error: {}".format(p.error), file=sys.stderr)
+ print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
- raise KafkaException(err)
- #print("Kafka consumer commit successful")
+ raise KafkaException(p.error)
+ # print("Kafka consumer commit successful")
pass
# previously, using pykafka
- #auto_commit_enable=True,
- #auto_commit_interval_ms=30000, # 30 seconds
+ # auto_commit_enable=True,
+ # auto_commit_interval_ms=30000, # 30 seconds
conf = {
- 'bootstrap.servers': hosts,
- 'group.id': group,
- 'on_commit': fail_fast,
- # messages don't have offset marked as stored until pushed to
- # elastic, but we do auto-commit stored offsets to broker
- 'enable.auto.offset.store': False,
- 'enable.auto.commit': True,
+ "bootstrap.servers": hosts,
+ "group.id": group,
+ "on_commit": fail_fast,
+ # messages don't have offset marked as stored until processed,
+ # but we do auto-commit stored offsets to broker
+ "enable.auto.offset.store": False,
+ "enable.auto.commit": True,
# user code timeout; if no poll after this long, assume user code
- # hung and rebalance (default: 5min)
- 'max.poll.interval.ms': 120000,
- 'default.topic.config': {
- 'auto.offset.reset': 'latest',
+ # hung and rebalance (default: 6min)
+ "max.poll.interval.ms": 360000,
+ "default.topic.config": {
+ "auto.offset.reset": "latest",
},
}
- def on_rebalance(consumer, partitions):
+ def on_rebalance(consumer: Any, partitions: List[Any]) -> None:
for p in partitions:
if p.error:
raise KafkaException(p.error)
- sys.stderr.write("Kafka partitions rebalanced: {} / {}\n".format(
- consumer, partitions))
+ print(
+ "Kafka partitions rebalanced: {} / {}".format(consumer, partitions), file=sys.stderr
+ )
consumer = Consumer(conf)
# NOTE: it's actually important that topic_name *not* be bytes (UTF-8
# encoded)
- consumer.subscribe([topic_name],
+ consumer.subscribe(
+ [topic_name],
on_assign=on_rebalance,
on_revoke=on_rebalance,
)
- sys.stderr.write("Consuming from kafka topic {}, group {}\n".format(topic_name, group))
+ print("Consuming from kafka topic {}, group {}".format(topic_name, group), file=sys.stderr)
return consumer
diff --git a/python/sandcrawler/xml.py b/python/sandcrawler/xml.py
new file mode 100644
index 0000000..83d53d4
--- /dev/null
+++ b/python/sandcrawler/xml.py
@@ -0,0 +1,6 @@
+import xml.etree.ElementTree as ET
+
+
+def xml_reserialize(raw: bytes) -> str:
+ root = ET.fromstring(raw)
+ return '<?xml version="1.0" encoding="UTF-8"?>\n' + ET.tostring(root, encoding="unicode")
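+
+
+# A minimal usage sketch (the input bytes below are a made-up example):
+#
+#   xml_reserialize(b"<doc><p>hi</p></doc>")
+#   # -> '<?xml version="1.0" encoding="UTF-8"?>\n<doc><p>hi</p></doc>'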
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
new file mode 100755
index 0000000..aebcbe1
--- /dev/null
+++ b/python/sandcrawler_worker.py
@@ -0,0 +1,495 @@
+#!/usr/bin/env python3
+"""
+These are generally for continuously running workers that consume from Kafka.
+Outputs might either be pushed back into Kafka, or directly into sandcrawler-db
+or S3 (SeaweedFS).
+"""
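+
+# Example invocation (broker hostname and env are hypothetical; see main() below
+# for the actual flags and subcommands):
+#
+#   ./sandcrawler_worker.py --kafka-hosts kafka-broker.example.org:9092 --env prod ingest-file --bulk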
+
+import argparse
+import os
+import subprocess
+import sys
+
+import sentry_sdk
+
+from sandcrawler import *
+from sandcrawler.persist import (
+ PersistCrossrefWorker,
+ PersistHtmlTeiXmlWorker,
+ PersistXmlDocWorker,
+)
+
+
+def run_grobid_extract(args):
+ consume_topic = "sandcrawler-{}.ungrobided-pg".format(args.env)
+ produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=produce_topic,
+ )
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ wayback_client = WaybackClient(
+ host_url=args.grobid_host,
+ )
+ worker = GrobidWorker(
+ grobid_client=grobid_client,
+ wayback_client=wayback_client,
+ sink=sink,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="grobid-extract",
+ batch_size=1,
+ )
+ pusher.run()
+
+
+def run_pdf_extract(args):
+ consume_topic = "sandcrawler-{}.unextracted".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ pdftext_sink = KafkaCompressSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=pdftext_topic,
+ )
+ thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=thumbnail_topic,
+ )
+ wayback_client = WaybackClient(
+ host_url=args.grobid_host,
+ )
+ worker = PdfExtractWorker(
+ wayback_client=wayback_client,
+ sink=pdftext_sink,
+ thumbnail_sink=thumbnail_sink,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="pdf-extract",
+ batch_size=1,
+ push_timeout_sec=120,
+ )
+ pusher.run()
+
+
+def run_persist_grobid(args):
+ consume_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ worker = PersistGrobidWorker(
+ db_url=args.db_url,
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ s3_only=args.s3_only,
+ db_only=args.db_only,
+ )
+ kafka_group = "persist-grobid"
+ if args.s3_only:
+ kafka_group += "-s3"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group=kafka_group,
+ push_batches=True,
+ batch_size=25,
+ )
+ pusher.run()
+
+
+def run_persist_pdftext(args):
+ consume_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ worker = PersistPdfTextWorker(
+ db_url=args.db_url,
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ s3_only=args.s3_only,
+ db_only=args.db_only,
+ )
+ kafka_group = "persist-pdf-text"
+ if args.s3_only:
+ kafka_group += "-s3"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group=kafka_group,
+ push_batches=True,
+ batch_size=25,
+ )
+ pusher.run()
+
+
+def run_persist_thumbnail(args):
+ consume_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ worker = PersistThumbnailWorker(
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ s3_extension=".180px.jpg",
+ s3_folder="pdf",
+ )
+ kafka_group = "persist-pdf-thumbnail"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group=kafka_group,
+ push_batches=False,
+ raw_records=True,
+ batch_size=25,
+ )
+ pusher.run()
+
+
+def run_persist_xml_doc(args: argparse.Namespace) -> None:
+ consume_topic = f"sandcrawler-{args.env}.xml-doc"
+ worker = PersistXmlDocWorker(
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ )
+ kafka_group = "persist-xml-doc"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group=kafka_group,
+ push_batches=False,
+ batch_size=25,
+ )
+ pusher.run()
+
+
+def run_persist_html_teixml(args: argparse.Namespace) -> None:
+ consume_topic = f"sandcrawler-{args.env}.html-teixml"
+ worker = PersistHtmlTeiXmlWorker(
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ )
+ kafka_group = "persist-html-teixml"
+ if args.kafka_group_suffix:
+ kafka_group += args.kafka_group_suffix
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group=kafka_group,
+ push_batches=False,
+ batch_size=25,
+ )
+ pusher.run()
+
+
+def run_persist_pdftrio(args):
+ consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env)
+ worker = PersistPdfTrioWorker(
+ db_url=args.db_url,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-pdftrio",
+ push_batches=True,
+ batch_size=100,
+ )
+ pusher.run()
+
+
+def run_ingest_file(args):
+ spn_cdx_retry_sec = 9.0
+ if args.bulk:
+ consume_group = "sandcrawler-{}-ingest-file-bulk".format(args.env)
+ consume_topic = "sandcrawler-{}.ingest-file-requests-bulk".format(args.env)
+ elif args.priority:
+ spn_cdx_retry_sec = 45.0
+ consume_group = "sandcrawler-{}-ingest-file-priority".format(args.env)
+ consume_topic = "sandcrawler-{}.ingest-file-requests-priority".format(args.env)
+ else:
+ spn_cdx_retry_sec = 1.0
+ consume_group = "sandcrawler-{}-ingest-file".format(args.env)
+ consume_topic = "sandcrawler-{}.ingest-file-requests-daily".format(args.env)
+ produce_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
+ grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
+ sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=produce_topic,
+ )
+ grobid_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=grobid_topic,
+ )
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ pdftext_sink = KafkaCompressSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=pdftext_topic,
+ )
+ thumbnail_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=thumbnail_topic,
+ )
+ xmldoc_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=xmldoc_topic,
+ )
+ htmlteixml_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=htmlteixml_topic,
+ )
+ worker = IngestFileWorker(
+ grobid_client=grobid_client,
+ sink=sink,
+ grobid_sink=grobid_sink,
+ thumbnail_sink=thumbnail_sink,
+ pdftext_sink=pdftext_sink,
+ xmldoc_sink=xmldoc_sink,
+ htmlteixml_sink=htmlteixml_sink,
+ # don't SPNv2 for --bulk or --skip-spn
+ try_spn2=not (args.bulk or args.skip_spn),
+ spn_cdx_retry_sec=spn_cdx_retry_sec,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group=consume_group,
+ batch_size=1,
+ )
+ pusher.run()
+
+
+def run_persist_ingest_file(args):
+ consume_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
+ worker = PersistIngestFileResultWorker(
+ db_url=args.db_url,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-ingest",
+ push_batches=True,
+ batch_size=100,
+ )
+ pusher.run()
+
+
+def run_persist_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ consume_topic = "fatcat-{}.api-crossref".format(args.env)
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-crossref",
+ push_batches=True,
+ # small batch size because doing GROBID processing
+ batch_size=batch_size,
+ )
+ pusher.run()
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-hosts",
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "--kafka-group-suffix", default="", help="Kafka consumer group suffix (optional)"
+ )
+ parser.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ parser.add_argument(
+ "--db-url",
+ help="postgresql database connection string",
+ default="postgres:///sandcrawler",
+ )
+ parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000")
+ parser.add_argument(
+ "--s3-access-key",
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY")
+ or os.environ.get("MINIO_ACCESS_KEY"),
+ )
+ parser.add_argument(
+ "--s3-secret-key",
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get("SANDCRAWLER_BLOB_SECRET_KEY")
+ or os.environ.get("MINIO_SECRET_KEY"),
+ )
+ parser.add_argument(
+ "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev"
+ )
+ subparsers = parser.add_subparsers()
+
+ sub_grobid_extract = subparsers.add_parser(
+ "grobid-extract",
+ help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka",
+ )
+ sub_grobid_extract.set_defaults(func=run_grobid_extract)
+
+ sub_pdf_extract = subparsers.add_parser(
+ "pdf-extract",
+ help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka",
+ )
+ sub_pdf_extract.set_defaults(func=run_pdf_extract)
+
+ sub_persist_grobid = subparsers.add_parser(
+ "persist-grobid",
+ help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
+ sub_persist_grobid.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_persist_grobid.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to database (don't upload TEI-XML to S3)",
+ )
+ sub_persist_grobid.set_defaults(func=run_persist_grobid)
+
+ sub_persist_pdftext = subparsers.add_parser(
+ "persist-pdftext",
+ help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
+ sub_persist_pdftext.add_argument(
+ "--s3-only",
+ action="store_true",
+ help="only upload TEI-XML to S3 (don't write to database)",
+ )
+ sub_persist_pdftext.add_argument(
+ "--db-only",
+ action="store_true",
+ help="only write status to database (don't upload TEI-XML to S3)",
+ )
+ sub_persist_pdftext.set_defaults(func=run_persist_pdftext)
+
+ sub_persist_thumbnail = subparsers.add_parser(
+ "persist-thumbnail",
+ help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres",
+ )
+ sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail)
+
+ sub_persist_xml_doc = subparsers.add_parser(
+ "persist-xml-doc",
+ help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket",
+ )
+ sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc)
+
+ sub_persist_html_teixml = subparsers.add_parser(
+ "persist-html-teixml",
+ help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket",
+ )
+ sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml)
+
+ sub_persist_pdftrio = subparsers.add_parser(
+ "persist-pdftrio",
+ help="daemon that consumes pdftrio output from Kafka and pushes to postgres",
+ )
+ sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio)
+
+ sub_ingest_file = subparsers.add_parser(
+ "ingest-file",
+ help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka",
+ )
+ sub_ingest_file.add_argument(
+ "--bulk",
+ action="store_true",
+ help="consume from bulk kafka topic (eg, for ingest backfill)",
+ )
+ sub_ingest_file.add_argument(
+ "--skip-spn",
+ action="store_true",
+ help="don't do SPN lookups",
+ )
+ sub_ingest_file.add_argument(
+ "--priority",
+ action="store_true",
+ help="consume from priority kafka topic (eg, for SPN requests)",
+ )
+ sub_ingest_file.set_defaults(func=run_ingest_file)
+
+ sub_persist_ingest_file = subparsers.add_parser(
+ "persist-ingest-file",
+ help="daemon that consumes ingest-file output from Kafka and pushes to postgres",
+ )
+ sub_persist_ingest_file.set_defaults(func=run_persist_ingest_file)
+
+ sub_persist_crossref = subparsers.add_parser(
+ "persist-crossref",
+ help="daemon that persists crossref to postgres; also does GROBID ref transform",
+ )
+ sub_persist_crossref.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+ sub_persist_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
+ sub_persist_crossref.set_defaults(func=run_persist_crossref)
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ parser.print_help(file=sys.stderr)
+ sys.exit(-1)
+
+ # configure sentry *after* parsing args
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
+ args.func(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
new file mode 100755
index 0000000..4561541
--- /dev/null
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+"""
+This script is intended to be used for backfill ingest of old crawls. It can
+also be used as a fast path for getting freshly crawled content into fatcat if
+the crawl was a hit and the arabesque JSON was exported conservatively.
+
+Run like:
+
+ ./arabesque2ingestrequest.py example_arabesque.json --link-source pmc --extid-type pmcid > ingest_requests.json
+
+The resulting requests can then be run through the ingest tool, or dumped into
+the Kafka request queue.
+"""
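+
+# With the invocation above, each output line is a single ingest request; the
+# URL and identifiers in this example are made up:
+#
+#   {"base_url": "https://europepmc.org/articles/PMC123456?pdf=render", "ext_ids": {"pmcid": "PMC123456"}, "ingest_request_source": "arabesque", "ingest_type": "pdf", "link_source": "pmc", "link_source_id": "PMC123456"}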
+
+import argparse
+import json
+import sys
+
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+ if not row["hit"]:
+ continue
+
+ request = {
+ "base_url": row["final_url"],
+ "ingest_type": args.ingest_type,
+ "link_source": args.link_source,
+ "link_source_id": row["identifier"],
+ "ingest_request_source": args.ingest_request_source,
+ "ext_ids": {
+ args.extid_type: row["identifier"],
+ },
+ }
+ if args.release_stage:
+ assert args.release_stage in (
+ "published",
+ "submitted",
+ "accepted",
+ "draft",
+ "update",
+ )
+ request["release_stage"] = args.release_stage
+
+ print("{}".format(json.dumps(request, sort_keys=True)))
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--link-source", required=True, help="link_source to include in request"
+ )
+ parser.add_argument("--extid-type", required=True, help="extid to encode identifier as")
+ parser.add_argument(
+ "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)"
+ )
+ parser.add_argument(
+ "--ingest-request-source", default="arabesque", help="to include in request"
+ )
+ parser.add_argument("--release-stage", default=None, help="to include in request")
+ parser.add_argument(
+ "json_file", help="arabesque output file to use", type=argparse.FileType("r")
+ )
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
new file mode 100755
index 0000000..6328f52
--- /dev/null
+++ b/python/scripts/archiveorg_fileset.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""
+Helper script to generate fatcat fileset entities (as JSON) from archive.org items.
+
+Takes either two args (archive.org item name and release ident), or a stream of
+tab-separated such pairs on stdin.
+
+TODO:
+- should this check the item type?
+"""
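+
+# Example usage (the item name and release ident below are hypothetical):
+#
+#   ./archiveorg_fileset.py some-dataset-item qzpvarwoknggvbq7aqlmehqvee
+#   cat item_release_pairs.tsv | ./archiveorg_fileset.py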
+
+import json
+import sys
+from typing import Any
+
+import internetarchive
+
+FORMAT_TO_MIMETYPE = {
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
+ #'application/msword (.doc)', # .doc
+ #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
+ #'application/vnd.ms-excel', # .xls
+ #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
+}
+
+
+def want_file(f: dict, item_name: str) -> bool:
+ """
+ Filters IA API files
+ """
+ if f.source != "original":
+ return False
+ for suffix in [
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
+ ]:
+ if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
+ return False
+ if f.name.startswith("_"):
+ return False
+ if item_name.startswith("academictorrents_"):
+ for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]:
+ if f.name == item_name + suffix:
+ return False
+ return True
+
+
+def parse_file(f: dict) -> dict:
+ """
+ Takes an IA API file and turns it into a fatcat fileset manifest file
+ """
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = {
+ "path": f.name,
+ "size": int(f.size),
+ "sha1": f.sha1,
+ "md5": f.md5,
+ }
+ # TODO: will disable this hard check eventually and replace with:
+ # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+ mimetype = FORMAT_TO_MIMETYPE[f.format]
+ if mimetype:
+ mf["extra"] = dict(mimetype=mimetype)
+ return mf
+
+
+def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
+ print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
+ if release_id.startswith("release_"):
+ release_id = release_id[8:]  # strip "release_" prefix (8 chars)
+ assert len(release_id) == 26
+ item = session.get_item(item_name)
+ assert item.metadata["mediatype"] not in ["collection", "web"]
+ item_files = item.get_files(on_the_fly=False)
+ manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
+ fileset = {
+ "manifest": manifest,
+ "urls": [
+ {
+ "rel": "archive",
+ "url": f"https://archive.org/download/{item_name}/",
+ },
+ ],
+ "release_ids": [release_id],
+ # extra={},
+ }
+ print(json.dumps(fileset))
+ return fileset
+
+
+def main():
+ session = internetarchive.get_session()
+ if len(sys.argv) == 3:
+ item_name = sys.argv[1]
+ release_id = sys.argv[2]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+ else:
+ for line in sys.stdin:
+ line = line.strip()
+ if not line:
+ continue
+ fields = line.split("\t")
+ assert len(fields) == 2
+ item_name = fields[0]
+ release_id = fields[1]
+ item_to_fileset(item_name, release_id=release_id, session=session)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..0b60da3
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls metadata API to enumerate all items/files, then fetches and concatenates
+them all. Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+ ./cdx_collection.py SOME_COLLECTION_NAME
+"""
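+
+# The merged output is written to the current working directory as
+# <collection>.cdx.gz (eg, SOME_COLLECTION_NAME.cdx.gz for the example above).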
+
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import internetarchive as ia
+import requests
+
+
+def run():
+
+ if len(sys.argv) != 2:
+ print("Expected a single argument (collection name)")
+ sys.exit(-1)
+
+ collection = sys.argv[1]
+
+ # Check collection name is clean
+ assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum()
+
+ tempdir = tempfile.mkdtemp()
+ print("Looking up collection: {}".format(collection))
+
+ # First fetch list
+ item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection)))
+
+ if len(item_list) == 0:
+ print("No items found, bailing")
+ sys.exit(-1)
+
+ print("Found {} potential items".format(len(item_list)))
+ status = True
+ errors = []
+ for item in item_list:
+ item = item["identifier"]
+ # TODO: error handling
+ try:
+ ret = ia.download(
+ item,
+ files=[item + ".cdx.gz"],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000,
+ )
+ status = ret and status
+ except requests.exceptions.ReadTimeout as rt:
+ print(str(rt), file=sys.stderr)
+ errors.append(rt)
+ continue
+
+ if errors:
+ print("## Download Errors", file=sys.stderr)
+ for e in errors:
+ print(e, file=sys.stderr)
+
+ # Combine files
+ print("Merging and re-compressing all CDX files...")
+ # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+ subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True)
+
+ # Move and cleanup
+ shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection))
+
+ print("Done!")
+
+
+if __name__ == "__main__":
+ run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
new file mode 100755
index 0000000..e3bf4f0
--- /dev/null
+++ b/python/scripts/covid2ingestrequest.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""
+Transform COVID-19 paper metadata (CNKI and Wanfang scrape JSON) into ingest requests.
+"""
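+
+# Each output line is a single ingest request; the URL and identifier in this
+# example are made up:
+#
+#   {"base_url": "http://www.wanfangdata.com.cn/some-article", "ingest_request_source": "scrape-covid19", "ingest_type": "pdf", "link_source": "wanfang_covid19", "link_source_id": "wf-12345"}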
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
+
+def canon(s):
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+
+def transform_cnki(obj):
+
+ requests = []
+ assert obj["cnki_id"]
+
+ requests = []
+ requests.append(
+ {
+ "base_url": canon(obj["info_url"]),
+ "ingest_type": "pdf",
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
+ if "read_url" in obj:
+ requests.append(
+ {
+ "base_url": canon(obj["read_url"]),
+ "ingest_type": "pdf", # actually HTML
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
+
+ return requests
+
+
+def transform_wanfang(obj):
+
+ assert obj["wanfang_id"]
+ return [
+ {
+ "base_url": canon(obj["url"]),
+ "ingest_type": "pdf",
+ "link_source": "wanfang_covid19",
+ "link_source_id": obj["wanfang_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ ]
+
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ if "wanfang_id" in row:
+ requests = transform_wanfang(row) or []
+ elif "cnki_id" in row:
+ requests = transform_cnki(row) or []
+ else:
+ continue
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r")
+ )
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 86b3b35..27ccf21 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -19,23 +19,20 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
"""
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
import boto3
-import raven
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+import sentry_sdk
def b32_hex(s):
@@ -45,81 +42,80 @@ def b32_hex(s):
s = s[5:]
if len(s) != 32:
return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
-class DeliverDumpGrobidS3():
+class DeliverDumpGrobidS3:
def __init__(self, s3_bucket, **kwargs):
self.rstore = None
self.count = Counter()
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
- self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
- self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "grobid/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml")
+ self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def run(self, dump_file):
sys.stderr.write("Starting...\n")
for line in dump_file:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, grobid_json = line[0], line[1]
if len(sha1_hex) != 40:
sha1_hex = b32_hex(sha1_hex)
assert len(sha1_hex) == 40
grobid = json.loads(grobid_json)
- tei_xml = grobid.get('tei_xml')
+ tei_xml = grobid.get("tei_xml")
if not tei_xml:
print("{}\tskip empty".format(sha1_hex))
- self.count['skip-empty'] += 1
+ self.count["skip-empty"] += 1
continue
- tei_xml = tei_xml.encode('utf-8')
+ tei_xml = tei_xml.encode("utf-8")
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
Body=tei_xml,
StorageClass=self.s3_storage_class,
)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="grobid/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".tei.xml",
- help='file suffix for created objects')
- parser.add_argument('--s3-storage-class',
- type=str,
- default="STANDARD",
- help='AWS S3 storage class (redundancy) to use')
- parser.add_argument('dump_file',
- help="TSV/JSON dump file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix",
+ type=str,
+ default="grobid/",
+ help="key prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--s3-storage-class",
+ type=str,
+ default="STANDARD",
+ help="AWS S3 storage class (redundancy) to use",
+ )
+ parser.add_argument(
+ "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r")
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index 3dcf962..093f32a 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -7,160 +7,191 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
-import raven
+import sentry_sdk
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+from wayback.resourcestore import ResourceStore
class DeliverGwbDisk:
-
def __init__(self, disk_dir, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.disk_dir = disk_dir
- self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
- self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
+ self.disk_prefix = kwargs.get("disk_prefix", "pdf/")
+ self.disk_suffix = kwargs.get("disk_suffix", ".pdf")
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
+ return None, dict(
+ status="error",
reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Ensuring all 65536 base directories exist...\n")
for i in range(256):
for j in range(256):
- fpath = "{}/{}{:02x}/{:02x}".format(
- self.disk_dir,
- self.disk_prefix,
- i,
- j)
+ fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j)
os.makedirs(fpath, exist_ok=True)
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# save to disk
fpath = "{}/{}{}/{}/{}{}".format(
- self.disk_dir,
- self.disk_prefix,
- sha1_hex[0:2],
- sha1_hex[2:4],
- sha1_hex,
- self.disk_suffix)
- with open(fpath, 'wb') as f:
+ self.disk_dir,
+ self.disk_prefix,
+ sha1_hex[0:2],
+ sha1_hex[2:4],
+ sha1_hex,
+ self.disk_suffix,
+ )
+ with open(fpath, "wb") as f:
f.write(blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
- self.count['success-disk'] += 1
+ self.count["success-disk"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--disk-dir',
- required=True,
- type=str,
- help='local base directory to save into')
- parser.add_argument('--disk-prefix',
- type=str,
- default="pdf/",
- help='directory prefix for items created in bucket')
- parser.add_argument('--disk-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created files')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--disk-dir", required=True, type=str, help="local base directory to save into"
+ )
+ parser.add_argument(
+ "--disk-prefix",
+ type=str,
+ default="pdf/",
+ help="directory prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--disk-suffix", type=str, default=".pdf", help="file suffix for created files"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index 39ac000..6f37ede 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -24,7 +24,7 @@ Output:
- log to stdout (redirect to file), prefixed by sha1
Requires:
-- raven (sentry)
+- sentry-sdk
- boto3 (AWS S3 client library)
- wayback/GWB libraries
"""
@@ -33,152 +33,180 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import boto3
-import raven
+import sentry_sdk
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = raven.Client()
+from wayback.resourcestore import ResourceStore
class DeliverGwbS3:
-
def __init__(self, s3_bucket, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
- self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "pdf/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".pdf")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
+ return None, dict(
+ status="error",
reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
- Body=blob)
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
+ Body=blob,
+ )
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
-@sentry_client.capture_exceptions
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="pdf/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created objects')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket"
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
+ sentry_sdk.init()
+
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == "__main__": # pragma: no cover
main()
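
A minimal standalone sketch (not part of the diff) of the S3 key layout this worker writes: the configured prefix, then the first four hex characters of the SHA-1 as a shard directory, then the full SHA-1, then the suffix. The blob below is illustrative.

import hashlib

def s3_key(sha1_hex: str, prefix: str = "pdf/", suffix: str = ".pdf") -> str:
    # mirrors the Key format passed to bucket.put_object() above
    return "{}{}/{}{}".format(prefix, sha1_hex[0:4], sha1_hex, suffix)

blob = b"%PDF-1.4 example blob"
print(s3_key(hashlib.sha1(blob).hexdigest()))
# -> pdf/<first 4 hex chars>/<full 40-char sha1>.pdf
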
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
new file mode 100755
index 0000000..aef5c12
--- /dev/null
+++ b/python/scripts/doaj2ingestrequest.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Transform a DOAJ article dump (JSON) into ingest requests.
+
+TODO: should we also attempt PDF ingest for HTML links? They often seem to be
+landing pages. Alternatively, a pipeline could notice, eg, `citation_pdf_url`
+in the HTML headers and add an ingest request on that basis, or even run the
+re-ingest in-process and publish a second result.
+"""
+
+import argparse
+import json
+import sys
+from typing import List, Optional
+
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ # "semanticscholar.org/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+ # large publishers/platforms; may remove in the future
+ # "://link.springer.com/",
+ # "://dergipark.gov.tr/",
+ # "frontiersin.org/",
+ # "scielo",
+]
+
+# these default to PDF; note that we also do pdf ingests for HTML pages
+CONTENT_TYPE_MAP = {
+ "abstract": [],
+ "doc": [],
+ "": ["pdf"],
+ "doi": ["pdf"],
+ "url": ["pdf"],
+ "fulltext": ["pdf"],
+ "anySimpleType": ["pdf"],
+ "application/pdf": ["pdf"],
+ "html": ["html", "pdf"],
+ "text/html": ["html", "pdf"],
+ "xml": ["xml"],
+}
+
+
+def canon(s: str) -> str:
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+
+def transform(obj: dict) -> List[dict]:
+ """
+ Transforms from a single DOAJ object to zero or more ingest requests.
+ Returns a list of dicts.
+ """
+
+ doaj_id = obj["id"].lower()
+ assert doaj_id
+
+ bibjson = obj["bibjson"]
+ if not bibjson["link"]:
+ return []
+
+ requests = []
+
+ doi: Optional[str] = None
+ for ident in bibjson["identifier"] or []:
+ if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
+ doi = ident["id"].lower()
+
+ for link in bibjson["link"] or []:
+ if link.get("type") != "fulltext" or not link.get("url"):
+ continue
+ ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
+ if not ingest_types:
+ continue
+
+ skip = False
+ for domain in DOMAIN_BLOCKLIST:
+ if domain in link["url"].lower():
+ skip = True
+ if skip:
+ continue
+ try:
+ base_url = canon(link["url"].strip())
+ except UnicodeEncodeError:
+ continue
+
+ if not base_url or len(base_url) > 1000:
+ continue
+
+ for ingest_type in ingest_types:
+ request = {
+ "base_url": base_url,
+ "ingest_type": ingest_type,
+ "link_source": "doaj",
+ "link_source_id": doaj_id,
+ "ingest_request_source": "doaj",
+ "release_stage": "published",
+ "rel": "publisher",
+ "ext_ids": {
+ "doi": doi,
+ "doaj": doaj_id,
+ },
+ "edit_extra": {},
+ }
+ requests.append(request)
+
+ return requests
+
+
+def run(args) -> None:
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ requests = transform(row) or []
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
+ )
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+
+if __name__ == "__main__":
+ main()
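
As a rough illustration of what transform() above emits, here is a made-up DOAJ record and the approximate request it maps to (the actual base_url also passes through canon()):

doaj_record = {
    "id": "00003741594643f4996e2555a01e03c7",  # hypothetical DOAJ article id
    "bibjson": {
        "identifier": [{"type": "doi", "id": "10.1234/example"}],
        "link": [
            {
                "type": "fulltext",
                "url": "https://example.com/article.pdf",
                "content_type": "application/pdf",
            }
        ],
    },
}
# transform(doaj_record) should yield roughly:
# {
#   "base_url": "https://example.com/article.pdf",
#   "ingest_type": "pdf",
#   "link_source": "doaj",
#   "link_source_id": "00003741594643f4996e2555a01e03c7",
#   "ingest_request_source": "doaj",
#   "release_stage": "published",
#   "rel": "publisher",
#   "ext_ids": {"doi": "10.1234/example", "doaj": "00003741594643f4996e2555a01e03c7"},
#   "edit_extra": {},
# }
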
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 9fe1499..44c091c 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
@@ -17,29 +17,32 @@ And outputs JSON objects that can be imported into fatcat with the
No dependencies (only python3 stdlib)
"""
-import sys
-import json
import base64
+import json
+import sys
+
def run():
for line in sys.stdin:
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 5
- raw_sha1 = line[0].replace('sha1:', '')
+ raw_sha1 = line[0].replace("sha1:", "")
dois = json.loads(line[1])
cdx = json.loads(line[2])
mimetype = line[3]
size = int(line[4])
- sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
+ sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower()
obj = dict(
sha1=sha1,
dois=dois,
- cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+ cdx=[dict(url=cdx["url"], dt=cdx["dt"])],
size=size,
- mimetype=mimetype)
+ mimetype=mimetype,
+ )
print(json.dumps(obj))
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
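
The base32-to-hex SHA-1 conversion used above can be sanity-checked with a small standalone snippet (not from the repo):

import base64
import hashlib

blob = b"hello world"
b32 = base64.b32encode(hashlib.sha1(blob).digest()).decode("ascii")  # heritrix/CDX style
hex_sha1 = base64.b16encode(base64.b32decode(b32)).decode("ascii").lower()
assert hex_sha1 == hashlib.sha1(blob).hexdigest()
print(b32, "->", hex_sha1)
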
diff --git a/python/scripts/fetch_cdx_sha1hex.py b/python/scripts/fetch_cdx_sha1hex.py
new file mode 100755
index 0000000..2eb56cb
--- /dev/null
+++ b/python/scripts/fetch_cdx_sha1hex.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+
+"""
+This is a helper script to take fatcat file entities with partial metadata (eg,
+missing SHA256) and try to find one or more CDX records where the file may be
+found in wayback.
+
+This script uses the sandcrawler library and should be run like:
+
+ head file_export.json | python -m scripts.fetch_cdx_sha1hex > results.json
+"""
+
+import base64
+import json
+import sys
+from typing import List, Optional
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+
+from sandcrawler.ia import CdxApiClient, cdx_to_dict
+
+
+def requests_retry_session(
+ retries: int = 10,
+ backoff_factor: int = 3,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: requests.Session = None,
+) -> requests.Session:
+ """
+ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+ """
+ session = session or requests.Session()
+ retry = Retry(
+ total=retries,
+ read=retries,
+ connect=retries,
+ backoff_factor=backoff_factor,
+ status_forcelist=status_forcelist,
+ )
+ adapter = HTTPAdapter(max_retries=retry)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+ return session
+
+
+def b32_hex(s: str) -> str:
+ """
+    Converts a base32-encoded SHA-1 checksum into a hex-encoded (base16) string.
+
+    Base32 checksums are used by, eg, heritrix and in wayback CDX files.
+ """
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ if len(s) == 40:
+ return s
+ raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
+
+SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030"
+
+
+def get_db_cdx(sha1hex: str, http_session) -> List[dict]:
+ resp = http_session.get(
+ SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(sha1hex="eq." + sha1hex)
+ )
+ resp.raise_for_status()
+ rows = resp.json()
+ return rows or []
+
+
+CDX_API_URL = "https://web.archive.org/cdx/search/cdx"
+
+
+def get_api_cdx(url: str, sha1hex: str, cdx_api) -> Optional[dict]:
+
+ params = {
+ "url": url,
+ "output": "json",
+ "matchType": "exact",
+ "limit": 20,
+ # TODO: group-by digest/checksum?
+ # can't filter status because might be warc/revisit
+ # "filter": "statuscode:200",
+ }
+ rows = cdx_api._query_api(params)
+ if not rows:
+ return None
+ for row in rows:
+ if row.sha1hex == sha1hex:
+ return row
+ return None
+
+
+def process_file(fe, session, cdx_api) -> dict:
+ status = "unknown"
+
+ # simple CDX db lookup first
+ cdx_row_list = get_db_cdx(fe["sha1"], http_session=session)
+ if cdx_row_list:
+ return dict(
+ file_entity=fe,
+ cdx_rows=cdx_row_list,
+ status="success-db",
+ )
+
+ original_urls = []
+ for pair in fe["urls"]:
+ u = pair["url"]
+        if "://web.archive.org/web/" not in u:
+ continue
+ seg = u.split("/")
+ assert seg[2] == "web.archive.org"
+ assert seg[3] == "web"
+ if not seg[4].isdigit():
+ continue
+ original_url = "/".join(seg[5:])
+ original_urls.append(original_url)
+
+ if len(original_urls) == 0:
+ return dict(file_entity=fe, status="skip-no-urls")
+
+ found_cdx_rows = []
+ for url in list(set(original_urls)):
+
+ cdx_record = None
+ try:
+            cdx_record = get_api_cdx(url, sha1hex=fe["sha1"], cdx_api=cdx_api)
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 403:
+ return dict(file_entity=fe, status="fail-cdx-403")
+ else:
+ raise
+ if cdx_record and cdx_record.sha1hex == fe["sha1"]:
+ found_cdx_rows.append(cdx_to_dict(cdx_record))
+
+ if found_cdx_rows:
+ return dict(
+ file_entity=fe,
+ cdx_rows=found_cdx_rows,
+ status="success-api",
+ )
+
+ return dict(
+ file_entity=fe,
+ status="fail-not-found",
+ )
+
+
+def main():
+ session = requests_retry_session()
+ session.headers.update(
+ {
+ "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot",
+ }
+ )
+ cdx_api = CdxApiClient()
+ for line in sys.stdin:
+ if not line.strip():
+ continue
+ fe = json.loads(line)
+ print(json.dumps(process_file(fe, session=session, cdx_api=cdx_api)))
+
+
+if __name__ == "__main__":
+ main()
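
The wayback replay URLs are unpacked by plain string splitting in process_file() above; a standalone sketch of that parsing, with an illustrative URL:

u = "https://web.archive.org/web/20200131060904/https://example.com/paper.pdf"
seg = u.split("/")
assert seg[2] == "web.archive.org" and seg[3] == "web" and seg[4].isdigit()
original_url = "/".join(seg[5:])
print(original_url)  # https://example.com/paper.pdf
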
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index c33ab86..8fce0d9 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -1,43 +1,48 @@
#!/usr/bin/env python3
-import sys
import json
+import sys
-with open('title_slug_blacklist.txt', 'r') as f:
- TITLE_BLACKLIST = [l.strip() for l in f]
-
-TITLE_BLACKLIST.extend((
- 'editorial',
- 'advertisement',
- 'bookreviews',
- 'reviews',
- 'nr',
- 'abstractoriginalarticle',
- 'originalarticle',
- 'impactfactor',
- 'articlenumber',
-))
+with open("title_slug_denylist.txt", "r") as f:
+ TITLE_DENYLIST = [l.strip() for l in f]
+
+TITLE_DENYLIST.extend(
+ (
+ "editorial",
+ "advertisement",
+ "bookreviews",
+ "reviews",
+ "nr",
+ "abstractoriginalarticle",
+ "originalarticle",
+ "impactfactor",
+ "articlenumber",
+ )
+)
# The full name can't *entirely* be one of these
-NAME_BLACKLIST = (
- 'phd',
- 'phdstudent',
+NAME_DENYLIST = (
+ "phd",
+ "phdstudent",
)
+
def tokenize(s, remove_whitespace=True):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+ return s.encode("ascii", "replace").decode("utf8").replace("?", "")
+
assert tokenize("Impact Factor: 2.114") == "impactfactor"
-assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
+assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
+
def filter_title(title):
@@ -45,16 +50,16 @@ def filter_title(title):
if len(title) > 500:
return None
title_slug = tokenize(title, remove_whitespace=True)
- if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
+ if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
return None
- if title_slug.startswith('nr'):
+ if title_slug.startswith("nr"):
return None
- if title.lower().replace('.', '').startswith('int j '):
+ if title.lower().replace(".", "").startswith("int j "):
return None
for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
if title.startswith(prefix):
- title.replace(prefix, '')
+ title.replace(prefix, "")
if title.startswith("The Journal of "):
return None
@@ -78,63 +83,84 @@ def filter_title(title):
return None
# too deep subtitling/splitting
- if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+ if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1:
return None
return title
+
def filter_author_name(name):
- name = name['name']
- if name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
+ name = name["name"]
+ if name.strip().lower().replace(" ", "") in NAME_DENYLIST:
return None
- return ' '.join([t for t in name.split() if tokenize(t)])
+ return " ".join([t for t in name.split() if tokenize(t)])
+
def filter_authors(l):
return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
+
def filter_refs(l):
# TODO:
return l
+
def filter_journal_name(name):
- # same blacklist, for now
+ # same denylist, for now
if not name:
return None
- name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+ name = name.replace(" e-ISSN", "").replace(" p-ISSN", "")
slug_name = tokenize(name)
- if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
- return None
- for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+ if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
+ return None
+ for prefix in (
+ "/ ",
+ "~ ",
+ "& ",
+ "© ",
+ "Original Research Article ",
+ "Original Article ",
+ "Research Article ",
+ "Available online www.jocpr.com ",
+ ):
if name.startswith(prefix):
- name = name.replace(prefix, '')
- for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+ name = name.replace(prefix, "")
+ for suffix in (
+ " Available online at www.sciarena.com",
+ " Original Article",
+ " Available online at",
+ " ISSN",
+ " ISSUE",
+ ):
if name.endswith(suffix):
- name = name.replace(suffix, '')
+ name = name.replace(suffix, "")
if "====================" in name:
return None
if len(name) > 150:
return None
- return ' '.join(name.split())
+ return " ".join(name.split())
+
def filter_metadata(obj):
- if not (obj.get('title') and obj.get('authors')):
+ if not (obj.get("title") and obj.get("authors")):
return None
- title = filter_title(obj['title'])
+ title = filter_title(obj["title"])
if not title:
- #sys.stderr.write("bad title\n")
+ # sys.stderr.write("bad title\n")
return None
else:
- obj['title'] = title
- obj['authors'] = filter_authors(obj['authors'])
- obj['citations'] = filter_refs(obj['citations'])
- obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
+ obj["title"] = title
+ obj["authors"] = filter_authors(obj["authors"])
+ obj["citations"] = filter_refs(obj["citations"])
+ obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"])
return obj
+
def run(invert=False):
for line in sys.stdin:
- fields = line.split('\t')
+ fields = line.split("\t")
if len(fields) == 5:
raw = fields[4]
elif len(fields) == 1:
@@ -151,9 +177,10 @@ def run(invert=False):
fields[4] = processed
else:
fields[0] = processed
- print('\t'.join(fields))
+ print("\t".join(fields))
elif invert:
print(raw.strip())
-if __name__=="__main__":
+
+if __name__ == "__main__":
run(invert="--invert" in sys.argv)
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index bbba770..87dae16 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:
- dates differ (not just year)
"""
-import sys
import json
+import sys
# out of 1000
SCORE_THRESHOLD = 900
@@ -28,17 +28,19 @@ MAX_SLUG_LINES = 50
REQUIRE_AUTHORS = False
+
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
+
def check_authors(left, right):
"""
@@ -51,7 +53,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -59,20 +61,22 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
+
def test_check_authors():
assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
+
# Rows are (score, left, right)
def process_group(rows):
@@ -86,10 +90,10 @@ def process_group(rows):
left = json.loads(row[1])
right = json.loads(row[2])
# authors must roughly match
- if not check_authors(left['authors'], right['authors']):
+ if not check_authors(left["authors"], right["authors"]):
continue
# years must match (if defined)
- if left['year'] and right['year'] and left['year'] != right['year']:
+ if left["year"] and right["year"] and left["year"] != right["year"]:
continue
filtered.append((left, right))
@@ -101,8 +105,8 @@ def process_group(rows):
group_ids = set()
for row in filtered[1:]:
(left, right) = row
- l_id = left['fatcat_release']
- r_id = right['fatcat_release']
+ l_id = left["fatcat_release"]
+ r_id = right["fatcat_release"]
releases[l_id] = left
releases[r_id] = right
if not group_ids:
@@ -119,6 +123,7 @@ def process_group(rows):
print(json.dumps([releases[ident] for ident in group_ids]))
+
def run():
last_slug = None
@@ -126,7 +131,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -140,5 +145,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index 3654b87..c5b7eef 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).
No dependencies (only python3 stdlib)
"""
-import sys
import json
+import sys
# out of 1000
score_threshold = 900
@@ -23,15 +23,16 @@ require_authors = 1
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
+
def check_authors(left, right):
"""
@@ -44,7 +45,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -52,20 +53,22 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
+
def test_check_authors():
assert not check_authors([], [])
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
+
# Rows are (score, grobid, crossref)
def process_group(rows):
@@ -78,20 +81,21 @@ def process_group(rows):
continue
grobid = json.loads(row[1])
crossref = json.loads(row[2])
- if not check_authors(crossref['authors'], grobid['authors']):
- #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
+ if not check_authors(crossref["authors"], grobid["authors"]):
+ # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
continue
else:
- #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
+ # print("YES: {} {}".format(crossref['authors'], grobid['authors']))
pass
- sha1 = grobid['sha1']
- doi = crossref['doi'].lower()
+ sha1 = grobid["sha1"]
+ doi = crossref["doi"].lower()
l = keepers.get(sha1, list())
l.append(doi)
keepers[sha1] = l
for sha1, doi_list in keepers.items():
print("{}\t{}".format(sha1, json.dumps(doi_list)))
+
def run():
last_slug = None
@@ -99,7 +103,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -112,5 +116,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index 79feac1..90a0f77 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, filters out raw affiliation strings, and
@@ -10,43 +9,49 @@ Run in bulk like:
ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
"""
-import sys
import json
+import sys
+
+from grobid_tei_xml import parse_document_xml
-from grobid2json import teixml2json
def parse_hbase(line):
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 2
sha1hex = line[0]
obj = json.loads(line[1])
- tei_xml = obj['tei_xml']
+ tei_xml = obj["tei_xml"]
return sha1hex, tei_xml
+
def parse_pg(line):
obj = json.loads(line)
- return obj['sha1hex'], obj['tei_xml']
+ return obj["sha1hex"], obj["tei_xml"]
+
-def run(mode='hbase'):
+def run(mode="hbase"):
for line in sys.stdin:
- if mode == 'hbase':
+ if mode == "hbase":
sha1hex, tei_xml = parse_hbase(line)
- elif mode == 'pg':
+ elif mode == "pg":
sha1hex, tei_xml = parse_pg(line)
else:
- raise NotImplementedError('parse mode: {}'.format(mode))
+ raise NotImplementedError("parse mode: {}".format(mode))
- obj = teixml2json(tei_xml, encumbered=False)
+ tei_doc = parse_document_xml(tei_xml)
+ tei_doc.remove_encumbered()
+ obj = tei_doc.to_legacy_dict()
affiliations = []
- for author in obj['authors']:
- if author.get('affiliation'):
- affiliations.append(author['affiliation'])
+ for author in obj["authors"]:
+ if author.get("affiliation"):
+ affiliations.append(author["affiliation"])
if affiliations:
# don't duplicate affiliations; only the unique ones
affiliations = list(set([json.dumps(a) for a in affiliations]))
affiliations = [json.loads(a) for a in affiliations]
- print('\t'.join([sha1hex, json.dumps(affiliations)]))
+ print("\t".join([sha1hex, json.dumps(affiliations)]))
+
-if __name__=='__main__':
+if __name__ == "__main__":
run()
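
The substantive change here is the switch from grobid2json.teixml2json to the grobid_tei_xml library; a minimal sketch of the new calls, assuming a TEI-XML string is already in hand:

from grobid_tei_xml import parse_document_xml

def affiliations_from_tei(tei_xml: str) -> list:
    tei_doc = parse_document_xml(tei_xml)
    tei_doc.remove_encumbered()  # strip copyright-encumbered text (abstract, body, etc)
    legacy = tei_doc.to_legacy_dict()
    return [a["affiliation"] for a in legacy["authors"] if a.get("affiliation")]
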
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index 3d2e14c..f941881 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -1,69 +1,67 @@
#!/usr/bin/env python3
-import sys
-import json
import datetime
+import json
+import sys
+
+MAX_ABSTRACT_BYTES = 4096
-MAX_ABSTRACT_BYTES=4096
def parse_grobid_json(obj):
- if not obj.get('title'):
+ if not obj.get("title"):
return None
extra = dict()
- if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
- abobj = dict(
- mimetype="text/plain",
- language=None,
- content=obj.get('abstract').strip())
+ if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES:
+ abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())
abstracts = [abobj]
else:
abstracts = None
contribs = []
- for a in obj.get('authors', []):
+ for a in obj.get("authors", []):
c = dict(raw_name=a, role="author")
contribs.append(c)
refs = []
- for raw in obj.get('citations', []):
+ for raw in obj.get("citations", []):
extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
- if raw.get('title'):
- ref['title'] = raw['title'].strip()
- if raw.get('date'):
+ ref["key"] = raw.get("id")
+ if raw.get("title"):
+ ref["title"] = raw["title"].strip()
+ if raw.get("date"):
try:
- year = int(raw['date'].strip()[:4])
- ref['year'] = year
+ year = int(raw["date"].strip()[:4])
+ ref["year"] = year
except:
pass
- for key in ('volume', 'url', 'issue', 'publisher'):
+ for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
extra[key] = raw[key].strip()
- if raw.get('authors'):
- extra['authors'] = [a['name'] for a in raw['authors']]
+ if raw.get("authors"):
+ extra["authors"] = [a["name"] for a in raw["authors"]]
if extra:
extra = dict(grobid=extra)
else:
extra = None
- ref['extra'] = extra
+ ref["extra"] = extra
refs.append(ref)
release_type = "journal-article"
release_date = None
- if obj.get('date'):
+ if obj.get("date"):
# TODO: only returns year, ever? how to handle?
- release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+ release_date = datetime.datetime(year=obj["date"], month=1, day=1)
- if obj.get('doi'):
- extra['doi'] = obj['doi']
- if obj['journal'].get('name'):
- extra['container_name'] = obj['journal']['name']
+ if obj.get("doi"):
+ extra["doi"] = obj["doi"].lower()
+ if obj["journal"].get("name"):
+ extra["container_name"] = obj["journal"]["name"]
- extra['is_longtail_oa'] = True
+ extra["is_longtail_oa"] = True
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -73,15 +71,17 @@ def parse_grobid_json(obj):
extra = None
return dict(
- title=obj['title'].strip(),
+ title=obj["title"].strip(),
contribs=contribs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=obj["journal"].get("publisher"),
+ volume=obj["journal"].get("volume"),
+ issue=obj["journal"].get("issue"),
abstracts=abstracts,
release_type=release_type,
release_date=release_date,
- extra=extra)
+ extra=extra,
+ )
+
def run():
for line in sys.stdin:
@@ -90,5 +90,6 @@ def run():
if out:
print(out)
-if __name__=="__main__":
+
+if __name__ == "__main__":
run()
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
new file mode 100755
index 0000000..8a353ca
--- /dev/null
+++ b/python/scripts/ingestrequest_row2json.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""
+This script is used to turn ingest request postgres rows (in JSON export
+format) back into regular ingest request JSON.
+
+The only difference is the name and location of some optional keys.
+"""
+
+import argparse
+import json
+import sys
+
+
+def transform(row):
+ """
+ dict-to-dict
+ """
+ row.pop("created", None)
+ extra = row.pop("request", None) or {}
+ for k in ("ext_ids", "edit_extra"):
+ if k in extra:
+ row[k] = extra[k]
+ if "release_ident" in extra:
+ row["fatcat"] = dict(release_ident=extra["release_ident"])
+ return row
+
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ try:
+ req = transform(json.loads(l))
+        except:
+            print(l, file=sys.stderr)
+            continue
+ if args.force_recrawl:
+ req["force_recrawl"] = True
+ print(json.dumps(req, sort_keys=True))
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="SQL output JSON file to process", type=argparse.FileType("r")
+ )
+ parser.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="whether to add recrawl (SPNv2) flag to request",
+ )
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+
+if __name__ == "__main__":
+ main()
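
A quick illustration of the key shuffling transform() above performs (row values are made up; assumes transform() from this script is in scope):

row = {
    "base_url": "https://example.com/paper.pdf",
    "ingest_type": "pdf",
    "link_source": "doi",
    "link_source_id": "10.1234/example",
    "created": "2020-01-01 00:00:00",
    "request": {"ext_ids": {"doi": "10.1234/example"}, "release_ident": "exampleident"},
}
print(transform(row))
# "created" is dropped, "ext_ids" is hoisted to the top level, and
# "release_ident" becomes {"fatcat": {"release_ident": ...}}
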
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 35cee5b..24e22fd 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -10,9 +10,9 @@ This was used to convert this manifest:
to JSON format for fast fatcat importing.
"""
-import sys
import json
import sqlite3
+import sys
# iterate over rows in files metadata...
# 1. select all identified DOIs
@@ -20,6 +20,7 @@ import sqlite3
# 2. select all file metadata
# 3. output object
+
def or_none(s):
if s is None:
return None
@@ -27,6 +28,7 @@ def or_none(s):
return None
return s
+
def process_db(db_path):
db = sqlite3.connect(db_path)
@@ -52,5 +54,6 @@ def process_db(db_path):
dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
print(json.dumps(obj))
-if __name__=="__main__":
+
+if __name__ == "__main__":
process_db(sys.argv[1])
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
new file mode 100755
index 0000000..97c38f9
--- /dev/null
+++ b/python/scripts/oai2ingestrequest.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""
+Transform an OAI-PMH bulk dump (JSON) into ingest requests.
+
+Eg: https://archive.org/details/oai_harvest_20200215
+"""
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ "semanticscholar.org/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+ "://127.0.0.1/",
+ "://www.kb.dk/",
+ "://kb-images.kb.dk/",
+ "://mdz-nbn-resolving.de/",
+ "://aggr.ukm.um.si/",
+ "://edoc.mpg.de/",
+ "doaj.org/",
+ "orcid.org/",
+ "://gateway.isiknowledge.com/",
+ # OAI specific additions
+ "://hdl.handle.net/",
+]
+
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+ "oai:kb.dk:",
+ "oai:bdr.oai.bsb-muenchen.de:",
+ "oai:hispana.mcu.es:",
+ "oai:bnf.fr:",
+ "oai:ukm.si:",
+ "oai:biodiversitylibrary.org:",
+ "oai:hsp.org:",
+ "oai:repec:",
+ "oai:n/a:",
+ "oai:quod.lib.umich.edu:",
+ "oai:americanae.aecid.es:",
+ "oai:www.irgrid.ac.cn:",
+ "oai:espace.library.uq.edu:",
+ "oai:edoc.mpg.de:",
+ "oai:bibliotecadigital.jcyl.es:",
+ "oai:repository.erciyes.edu.tr:",
+ "oai:krm.or.kr:",
+ "oai:hypotheses.org:%",
+]
+
+RELEASE_STAGE_MAP = {
+ "info:eu-repo/semantics/draftVersion": "draft",
+ "info:eu-repo/semantics/submittedVersion": "submitted",
+ "info:eu-repo/semantics/acceptedVersion": "accepted",
+ "info:eu-repo/semantics/publishedVersion": "published",
+ "info:eu-repo/semantics/updatedVersion": "updated",
+}
+
+
+def canon(s):
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+
+def transform(obj):
+ """
+ Transforms from a single OAI-PMH object to zero or more ingest requests.
+ Returns a list of dicts.
+ """
+
+ requests = []
+ if not obj.get("oai") or not obj["oai"].startswith("oai:"):
+ return []
+ if not obj.get("urls"):
+ return []
+
+ oai_id = obj["oai"].lower()
+ for prefix in OAI_BLOCKLIST:
+ if oai_id.startswith(prefix):
+ return []
+
+ # look in obj['formats'] for PDF?
+ if obj.get("formats"):
+ # if there is a list of formats, and it does not contain PDF, then
+ # skip. Note that we will continue if there is no formats list.
+ has_pdf = False
+ for f in obj["formats"]:
+ if "pdf" in f.lower():
+ has_pdf = True
+ if not has_pdf:
+ return []
+
+ doi = None
+ if obj.get("doi"):
+ doi = obj["doi"][0].lower().strip()
+ if not doi.startswith("10."):
+ doi = None
+
+ # infer release stage and/or type from obj['types']
+ release_stage = None
+ for t in obj.get("types", []):
+ if t in RELEASE_STAGE_MAP:
+ release_stage = RELEASE_STAGE_MAP[t]
+
+ # TODO: infer rel somehow? Eg, repository vs. OJS publisher
+ rel = None
+
+ for url in obj["urls"]:
+ skip = False
+ for domain in DOMAIN_BLOCKLIST:
+ if domain in url:
+ skip = True
+ if skip:
+ continue
+ try:
+ base_url = canon(url)
+ except UnicodeEncodeError:
+ continue
+
+ request = {
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "oai",
+ "link_source_id": oai_id,
+ "ingest_request_source": "metha-bulk",
+ "release_stage": release_stage,
+ "rel": rel,
+ "ext_ids": {
+ "oai": obj["oai"].lower(),
+ },
+ "edit_extra": {},
+ }
+ if doi:
+ request["ext_ids"]["doi"] = doi
+ requests.append(request)
+
+ return requests
+
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ requests = transform(row) or []
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file",
+ help="OAI-PMH dump file to use (usually stdin)",
+ type=argparse.FileType("r"),
+ )
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+
+if __name__ == "__main__":
+ main()
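
canon() here (and in the DOAJ/unpaywall scripts) relies on urlcanon's WHATWG ruleset; a tiny standalone example of the normalization it applies (output shown is approximate):

import urlcanon

parsed = urlcanon.parse_url("HTTP://Example.COM:80/pmh/Article.PDF")
print(str(urlcanon.whatwg(parsed)))
# expected: scheme and host lowercased, default port dropped, eg
# http://example.com/pmh/Article.PDF
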
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
new file mode 100755
index 0000000..8b57c5b
--- /dev/null
+++ b/python/scripts/pdf_thumbnail.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+"""
+Quick CLI script to convert a PDF to a thumbnail image (.png, jpeg, etc).
+
+Originally used to benchmark and compare file size/quality.
+"""
+
+import sys
+
+import poppler
+from PIL import Image
+
+
+def run(inpath, outpath):
+
+ try:
+ pdf = poppler.load_from_file(inpath)
+ page = pdf.create_page(0)
+ except Exception as e:
+ print(str(e), file=sys.stderr)
+ sys.exit(0)
+
+ renderer = poppler.PageRenderer()
+ full_page = renderer.render_page(page)
+ img = Image.frombuffer(
+ "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1
+ )
+ img.thumbnail((180, 300), Image.BICUBIC)
+ # img.thumbnail((360,600), Image.BICUBIC)
+ img.save(outpath)
+ # img.save(outpath, quality=95)
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 3:
+ print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)
+ sys.exit(-1)
+ run(sys.argv[1], sys.argv[2])
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
new file mode 100755
index 0000000..cb64a1a
--- /dev/null
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""
+Transform an unpaywall dump (JSON) into ingest requests.
+"""
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ "://doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+]
+
+RELEASE_STAGE_MAP = {
+ "draftVersion": "draft",
+ "submittedVersion": "submitted",
+ "acceptedVersion": "accepted",
+ "publishedVersion": "published",
+ "updatedVersion": "updated",
+}
+
+
+def canon(s):
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+
+def transform(obj):
+ """
+ Transforms from a single unpaywall object to zero or more ingest requests.
+ Returns a list of dicts.
+ """
+
+ requests = []
+ if not obj["doi"].startswith("10."):
+ return requests
+ if not obj["oa_locations"]:
+ return requests
+
+ for location in obj["oa_locations"]:
+ if not location["url_for_pdf"]:
+ continue
+ skip = False
+ for domain in DOMAIN_BLOCKLIST:
+ if domain in location["url_for_pdf"]:
+ skip = True
+ if skip:
+ continue
+ try:
+ base_url = canon(location["url_for_pdf"])
+ except UnicodeEncodeError:
+ continue
+
+ request = {
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "unpaywall",
+ "link_source_id": obj["doi"].lower(),
+ "ingest_request_source": "unpaywall",
+ "release_stage": RELEASE_STAGE_MAP.get(location["version"]),
+ "rel": location["host_type"],
+ "ext_ids": {
+ "doi": obj["doi"].lower(),
+ },
+ "edit_extra": {},
+ }
+ if obj.get("oa_status"):
+ request["edit_extra"]["oa_status"] = obj["oa_status"]
+ if location.get("evidence"):
+ request["edit_extra"]["evidence"] = location["evidence"]
+ if location["pmh_id"]:
+ request["ext_ids"]["pmh_id"] = location["pmh_id"]
+ requests.append(request)
+
+ return requests
+
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ requests = transform(row) or []
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="unpaywall dump file to use", type=argparse.FileType("r")
+ )
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+
+if __name__ == "__main__":
+ main()
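
And a sketch of the field mapping for a single unpaywall oa_location (values invented; the actual base_url also passes through canon()):

location = {
    "url_for_pdf": "https://repo.example.edu/bitstream/1/paper.pdf",
    "version": "acceptedVersion",
    "host_type": "repository",
    "evidence": "oa repository (via OAI-PMH doi match)",
    "pmh_id": "oai:repo.example.edu:1",
}
# for a record with "doi": "10.1234/example", transform() above would emit roughly:
# {
#   "base_url": "https://repo.example.edu/bitstream/1/paper.pdf",
#   "ingest_type": "pdf",
#   "link_source": "unpaywall",
#   "link_source_id": "10.1234/example",
#   "ingest_request_source": "unpaywall",
#   "release_stage": "accepted",          # via RELEASE_STAGE_MAP
#   "rel": "repository",
#   "ext_ids": {"doi": "10.1234/example", "pmh_id": "oai:repo.example.edu:1"},
#   "edit_extra": {"evidence": "oa repository (via OAI-PMH doi match)"},
# }
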
diff --git a/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
new file mode 100644
index 0000000..54d07db
--- /dev/null
+++ b/python/tests/files/crossref_api_work_978-3-030-64953-1_4.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,5,10]],"date-time":"2021-05-10T22:08:45Z","timestamp":1620684525878},"publisher-location":"Cham","reference-count":28,"publisher":"Springer International Publishing","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-64953-1_4","type":"book-chapter","created":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T02:57:20Z","timestamp":1610593040000},"page":"53-71","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mathematical Knowledge and Mathematical Objects"],"prefix":"10.1007","author":[{"given":"Lars-G\u00f6ran","family":"Johansson","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,1,14]]},"reference":[{"key":"4_CR12","doi-asserted-by":"publisher","volume-title":"Deflating existential consequence: A case for nominalism","author":"J Azzouni","year":"2004","unstructured":"Azzouni, J. (2004). Deflating existential consequence: A case for nominalism. New York: Oxford University Press.","DOI":"10.1093\/0195159888.001.0001"},{"key":"4_CR23","doi-asserted-by":"publisher","volume-title":"Foundations of constructive mathematics","author":"M Beeson","year":"1985","unstructured":"Beeson, M. (1985). Foundations of constructive mathematics. Berlin\/Heidelberg: Springer.","DOI":"10.1007\/978-3-642-68952-9"},{"issue":"2","key":"4_CR27","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1093\/philmat\/11.2.176","volume":"11","author":"H Billinge","year":"2003","unstructured":"Billinge, H. (2003). Did bishop have a philosophy of mathematics? Philosophica Mathematica, 11(2), 176\u2013194.","journal-title":"Philosophica Mathematica"},{"key":"4_CR29","doi-asserted-by":"publisher","volume-title":"Constructive analysis","author":"E Bishop","year":"1985","unstructured":"Bishop, E., & Bridges, D. S. (1985). Constructive analysis. Berlin: Springer.","DOI":"10.1007\/978-3-642-61667-9"},{"key":"4_CR37","series-title":"In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.)","volume-title":"Nominalism in the philosophy of mathematics","author":"O Bueno","year":"2014","unstructured":"Bueno, O. (2014). Nominalism in the philosophy of mathematics. In E. N. Zalta (Ed.), The Stanford encyclopedia of philosophy (spring 2014 ed.). Metaphysics Research Lab, Stanford University."},{"key":"4_CR38","volume-title":"Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen","author":"G Cantor","year":"1883","unstructured":"Cantor, G. (1883). Grundlagen einer allgemeinen mannigfaltiglehre. ein mathematisch-philosophisher versuch in der leher de unendlichen. Leipzig: Teubner."},{"key":"4_CR60","volume-title":"The seas of language","author":"M Dummett","year":"1993","unstructured":"Dummett, M. (1993). The seas of language. 
Oxford: Clarendon Press."},{"key":"4_CR73","volume-title":"In the light of logic","author":"S Feferman","year":"1998","unstructured":"Feferman, S. (1998). In the light of logic. New York: Oxford University Press."},{"key":"4_CR74","doi-asserted-by":"publisher","first-page":"590","DOI":"10.1093\/0195148770.003.0019","volume-title":"The Oxford handbook of philosophy of mathematics and logic","author":"S Feferman","year":"2005","unstructured":"Feferman, S. (2005). Predicativity. In S. Shapiro (Ed.), The Oxford handbook of philosophy of mathematics and logic (pp. 590\u2013624). New York\/Oxford: Oxford University Press."},{"key":"4_CR77","volume-title":"Science without numbers: A defence of nominalism","author":"H H Field","year":"1980","unstructured":"Field, H. H. (1980). Science without numbers: A defence of nominalism. Oxford: Blackwell."},{"key":"4_CR88","volume-title":"Werke, volume 8","author":"C F Gauss","year":"2011","unstructured":"Gauss, C. F. (2011). Werke, volume 8. Cambridge: Cambridge University Press."},{"key":"4_CR93","unstructured":"Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155\u2013172). Bobs-Merrill company."},{"key":"4_CR103","volume-title":"Mathematics without numbers: Towards a modal-structural interpretation","author":"G Hellman","year":"1989","unstructured":"Hellman, G. (1989). Mathematics without numbers: Towards a modal-structural interpretation. Oxford: Clarendon Press."},{"key":"4_CR126","first-page":"201","volume-title":"Bertrand Russell. Philosopher of the century","author":"G Kreisel","year":"1967","unstructured":"Kreisel, G. (1967). Mathematical logic: What has it done for the philosophy of mathematics? In R. Shoenman (Ed.), Bertrand Russell. Philosopher of the century (pp. 201\u2013272). London: George Allen & Unwin."},{"key":"4_CR135","doi-asserted-by":"crossref","unstructured":"Lear, J. (1980). Aristotelian infinity. Proceedings of the Aristotelian Society, New Series, 80, 187\u2013210.","DOI":"10.1093\/aristotelian\/80.1.187"},{"key":"4_CR175","doi-asserted-by":"publisher","first-page":"63","DOI":"10.12775\/LLP.1998.004","volume":"6","author":"F Pataut","year":"1998","unstructured":"Pataut, F. (1998). Incompleteness, constructivism and truth. Logic and Logical Philosophy, 6, 63\u201376.","journal-title":"Logic and Logical Philosophy"},{"key":"4_CR180","first-page":"294","volume":"14","author":"H Poincar\u00e9","year":"1906","unstructured":"Poincar\u00e9, H. (1906). Les math\u00e9matiques et la logique. Revue de m\u00e9taphysique et de morale, 14, 294\u2013317.","journal-title":"Revue de m\u00e9taphysique et de morale"},{"key":"4_CR190","volume-title":"Word and object","author":"W V O Quine","year":"1960","unstructured":"Quine, W. V. O. (1960). Word and object. Cambridge, MA: MIT Press."},{"key":"4_CR193","unstructured":"Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133\u2013136). Cambridge, MA: Harvard University Press."},{"key":"4_CR197","first-page":"31","volume-title":"Theories and things","author":"W V O Quine","year":"1981","unstructured":"Quine, W. V. O. (1981c). What price bivalence? In Theories and things (pp. 31\u201337). Cambridge, MA: The Belknap Press of Harvard University Press."},{"issue":"1","key":"4_CR198","doi-asserted-by":"publisher","first-page":"5","DOI":"10.2307\/2026889","volume":"89","author":"WV O Quine","year":"1992","unstructured":"Quine, W.V. O. (1992). Structure and nature. 
The Journal of Philosophy, 89(1), 5\u20139.","journal-title":"The Journal of Philosophy"},{"key":"4_CR199","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1080\/014453401625669","volume":"25","author":"P Raatikainen","year":"2004","unstructured":"Raatikainen, P. (2004). Conceptions of truth in intuitionism. History and Philosophy of Logic, 25, 131\u2013145.","journal-title":"History and Philosophy of Logic"},{"key":"4_CR210","unstructured":"Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29\u201353."},{"key":"4_CR212","volume-title":"Introduction to mathematical philosophy","author":"B Russell","year":"1919","unstructured":"Russell, B. (1919). Introduction to mathematical philosophy. London: Routledge."},{"key":"4_CR222","doi-asserted-by":"crossref","unstructured":"Schwarz, J. T. (2006(1966)). The pernicious influence of mathematics on science. In R. Hersch (Ed.), 18 unconventional essays on the nature of mathematics (Chap. 13, pp. 231\u2013235). New York: Springer.","DOI":"10.1007\/0-387-29831-2_13"},{"key":"4_CR233","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1007\/BF00247187","volume":"12","author":"G Sundholm","year":"1983","unstructured":"Sundholm, G. (1983). Constructions, proofs and the meaning of logical constants. Journal of Philosophical Logic, 12, 151\u2013172.","journal-title":"Journal of Philosophical Logic"},{"issue":"2","key":"4_CR235","doi-asserted-by":"publisher","first-page":"101","DOI":"10.1007\/s10701-007-9186-9","volume":"38","author":"M Tegmark","year":"2008","unstructured":"Tegmark, M. (2008). The mathematical universe. Foundations of Physics, 38(2), 101\u2013150.","journal-title":"Foundations of Physics"},{"key":"4_CR262","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1016\/0010-0277(90)90003-3","volume":"36","author":"K Wynn","year":"1990","unstructured":"Wynn, K. (1990). Children\u2019s understanding of counting. Cognition, 36, 155\u2013193.","journal-title":"Cognition"}],"container-title":["Synthese Library","Empiricism and Philosophy of Physics"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-64953-1_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,1,14]],"date-time":"2021-01-14T03:00:39Z","timestamp":1610593239000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":28,"URL":"http:\/\/dx.doi.org\/10.1007\/978-3-030-64953-1_4","relation":{},"ISSN":["0166-6991","2542-8292"],"issn-type":[{"value":"0166-6991","type":"print"},{"value":"2542-8292","type":"electronic"}],"published":{"date-parts":[[2021]]},"assertion":[{"value":"14 January 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}} \ No newline at end of file
diff --git a/python/tests/files/crossref_api_work_s1047951103000064.json b/python/tests/files/crossref_api_work_s1047951103000064.json
new file mode 100644
index 0000000..dfb795d
--- /dev/null
+++ b/python/tests/files/crossref_api_work_s1047951103000064.json
@@ -0,0 +1 @@
+{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2021,6,10]],"date-time":"2021-06-10T05:35:02Z","timestamp":1623303302043},"reference-count":46,"publisher":"Cambridge University Press (CUP)","issue":"1","license":[{"start":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T00:00:00Z","timestamp":1113782400000},"content-version":"unspecified","delay-in-days":807,"URL":"https:\/\/www.cambridge.org\/core\/terms"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Cardiol Young"],"published-print":{"date-parts":[[2003,2]]},"abstract":"<jats:p>We designed a multi-hospital prospective study of children less than 12 years to determine the comparative clinical profile, severity of carditis, and outcome on follow up of patients suffering an initial and recurrent episodes of acute rheumatic fever. The study extended over a period of 3 years, with diagnosis based on the Jones criteria. We included 161 children in the study, 57 having only one episode and 104 with recurrent episodes. Those seen in the first episode were differentiated from those with recurrent episodes on the basis of the history. The severity of carditis was graded by clinical and echocardiographic means. In those suffering their first episode, carditis was significantly less frequent (61.4%) compared to those having recurrent episodes (96.2%). Arthritis was more marked in the first episode (61.4%) compared to recurrent episodes (36.5%). Chorea was also significantly higher in the first episode (15.8%) compared to recurrent episodes (3.8%). Sub-cutaneous nodules were more-or-less the same in those suffering the first (7%) as opposed to recurrent episodes (5.8%), but Erythema marginatum was more marked during the first episode (3.5%), being rare in recurrent episodes at 0.9%. Fever was recorded in approximately the same numbers in first (45.6%) and recurrent episodes (48.1%). Arthralgia, in contrast, was less frequent in first (21.1%) compared to recurrent episodes (32.7%). A history of sore throat was significantly increased amongst those suffering the first episode (54.4%) compared to recurrent episodes (21.2%). When we compared the severity of carditis in the first versus recurrent episodes, at the start of study mild carditis was found in 29.8% versus 10.6%, moderate carditis in 26.3% versus 53.8%, and severe carditis in 5.3% versus 31.8% of cases, respectively. At the end of study, 30.3% of patients suffering their first episode were completely cured of carditis, and all others showed significant improvement compared to those with recurrent episodes, where only 6.8% were cured, little improvement or deterioration being noted in the remainder of the patients. 
We conclude that the clinical profile of acute rheumatic fever, especially that of carditis, is milder in those suffering their first attack compared to those with recurrent episodes.<\/jats:p>","DOI":"10.1017\/s1047951103000064","type":"journal-article","created":{"date-parts":[[2005,4,18]],"date-time":"2005-04-18T11:49:54Z","timestamp":1113824994000},"page":"28-35","source":"Crossref","is-referenced-by-count":11,"title":["Clinical profile of acute rheumatic fever in Pakistan"],"prefix":"10.1017","volume":"13","author":[{"given":"Hasina Suleman","family":"Chagani","sequence":"first","affiliation":[]},{"given":"Kalimuddin","family":"Aziz","sequence":"additional","affiliation":[]}],"member":"56","published-online":{"date-parts":[[2005,4,18]]},"reference":[{"key":"S1047951103000064_ref010","doi-asserted-by":"crossref","unstructured":"Alan L , Bisno . Group A streptococcal infection and acute rheumatic fever. N Engl J Med 1991; 325: 783\u2013793.","DOI":"10.1056\/NEJM199109123251106"},{"key":"S1047951103000064_ref036","doi-asserted-by":"crossref","unstructured":"Abbasi AS , Hashmi JA , Robinson RD , Suraya S , Syed SA . Prevalence of heart disease in school children of Karachi. Am J Cardiol 1966; 18: 544\u2013547.","DOI":"10.1016\/0002-9149(66)90008-7"},{"key":"S1047951103000064_ref025","unstructured":"Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285\u2013294."},{"key":"S1047951103000064_ref013","unstructured":"Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185\u2013192."},{"key":"S1047951103000064_ref007","doi-asserted-by":"crossref","unstructured":"Okoroma EO , Ihenacho HNC , Anyanwu CH . Rheumatic fever in Nigerian children. A prospective study of 66 patients. Am J Dis Child 1981; 35: 236\u2013238.","DOI":"10.1001\/archpedi.1981.02130270028010"},{"key":"S1047951103000064_ref031","doi-asserted-by":"crossref","unstructured":"Gordis L . Effectiveness of comprehensive care program in preventing rheumatic fever. N Engl J Med 1973; 289: 331\u2013335.","DOI":"10.1056\/NEJM197308162890701"},{"key":"S1047951103000064_ref012","unstructured":"Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21\u201324."},{"key":"S1047951103000064_ref026","doi-asserted-by":"crossref","unstructured":"Reale A , Colella C , Bruno AM . Mitral stenosis in childhood: Clinical and therapeutic aspects. Am Heart J 1963; 66: 15.","DOI":"10.1016\/0002-8703(63)90064-4"},{"key":"S1047951103000064_ref046","doi-asserted-by":"crossref","unstructured":"Aziz KU , Cheema L , Memon AD . Long-term observations of rheumatic carditis. Cardiol Young 1992; 2: 254\u2013260.","DOI":"10.1017\/S1047951100001001"},{"key":"S1047951103000064_ref041","unstructured":"Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300\u2013305."},{"key":"S1047951103000064_ref002","unstructured":"Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889."},{"key":"S1047951103000064_ref043","unstructured":"Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336\u2013345."},{"key":"S1047951103000064_ref037","unstructured":"Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. 
Pakistan Heart Journal 1981; 14: 2\u20136."},{"key":"S1047951103000064_ref029","doi-asserted-by":"crossref","unstructured":"Hassel TA , Stuart KL . Rheumatic fever prophylaxis. A three-year study. Br Med J 1972; 2: 39\u201340.","DOI":"10.1136\/bmj.2.5909.39"},{"key":"S1047951103000064_ref024","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Berry AM , Duggal S , Hooja V , Ghosh S . Sequel of initial attack of acute rheumatic fever. A prospective 5-year follow-up study. Circulation 1982; 65: 375\u2013379.","DOI":"10.1161\/01.CIR.65.2.375"},{"key":"S1047951103000064_ref022","doi-asserted-by":"crossref","unstructured":"Brownell KD , Rese FB . Acute rheumatic fever in children. Incidence in Borough of New York city. JAMA. 1973; 224: 1593\u20131597.","DOI":"10.1001\/jama.1973.03220260015004"},{"key":"S1047951103000064_ref035","unstructured":"Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071\u20131081."},{"key":"S1047951103000064_ref003","unstructured":"El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980's. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183\u2013203."},{"key":"S1047951103000064_ref045","doi-asserted-by":"crossref","unstructured":"Markowitz M . Eradication of rheumatic fever. An unfulfilled hope. Circulation 1970; 41: 1077\u20131084.","DOI":"10.1161\/01.CIR.41.6.1077"},{"key":"S1047951103000064_ref005","unstructured":"Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886."},{"key":"S1047951103000064_ref017","unstructured":"Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483\u2013494."},{"key":"S1047951103000064_ref028","doi-asserted-by":"crossref","unstructured":"Ehmke DA , Stehbens JA , Young L . Two studies of compliance with daily prophylaxis in rheumatic fever patients in Iowa. Am J Public Health 1980; 70: 1189\u20131193.","DOI":"10.2105\/AJPH.70.11.1189"},{"key":"S1047951103000064_ref021","doi-asserted-by":"crossref","unstructured":"Ward C . The reappraisal of the clinical features in acute and chronic rheumatic heart disease. Etiology implications. Am Heart J 1979; 98: 298\u2013306.","DOI":"10.1016\/0002-8703(79)90040-1"},{"key":"S1047951103000064_ref009","doi-asserted-by":"crossref","unstructured":"Sanyal SK , Thaper MK , Ahmed SA , Hooja V , Tewari P . The initial attack of acute rheumatic fever during childhood in North India. A prospective study of the clinical profile. Circulation 1974; 49: 7\u201312.","DOI":"10.1161\/01.CIR.49.1.7"},{"key":"S1047951103000064_ref016","unstructured":"Strasser T . Rheumatic fever and rheumatic heart disease in the 1970's. WHO Chron. 1978; 32: 18\u201325."},{"key":"S1047951103000064_ref019","doi-asserted-by":"crossref","unstructured":"Bland EF , Jones TD . Rheumatic fever and rheumatic heart disease. A twenty-year report on 1000 patients followed since childhood. Circulation 1951; 4: 836\u2013843.","DOI":"10.1161\/01.CIR.4.6.836"},{"key":"S1047951103000064_ref042","doi-asserted-by":"crossref","unstructured":"Wood HF , McCarty M . Laboratory aids in the diagnosis of rheumatic fever and evaluation of disease activity. Am J Med 1954; 17: 768\u2013774.","DOI":"10.1016\/0002-9343(54)90221-1"},{"key":"S1047951103000064_ref020","doi-asserted-by":"crossref","unstructured":"Baldwin JS , Kerr JM , Kuttner AG , Doyle EF . Observation in rheumatic nodules over 30 years period. 
J Pediatr 1960; 56: 465\u2013470.","DOI":"10.1016\/S0022-3476(60)80358-7"},{"key":"S1047951103000064_ref004","doi-asserted-by":"crossref","unstructured":"Majeed HA , Khan N , Dabbagh M , Naidi K . Acute rheumatic fever during childhood in Kuwait: The mild nature of initial attack. Ann Trop Paediatr 1981; 1: 13\u201320.","DOI":"10.1080\/02724936.1981.11748053"},{"key":"S1047951103000064_ref001","unstructured":"Brittanica: Book of year 1991. Chicago, 1991."},{"key":"S1047951103000064_ref039","unstructured":"Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990."},{"key":"S1047951103000064_ref040","doi-asserted-by":"crossref","unstructured":"Taranta A , Markowitz M . Rheumatic fever. A guide to its recognition, prevention and cure, with special reference to developing countries. M.T.P. Press Ltd., Boston, 1981.","DOI":"10.1007\/978-94-015-7171-5"},{"key":"S1047951103000064_ref032","unstructured":"Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1\u201315."},{"key":"S1047951103000064_ref014","unstructured":"Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2\u20139."},{"key":"S1047951103000064_ref011","doi-asserted-by":"crossref","unstructured":"Gharib R . Acute rheumatic fever in Shiraz, Iran. It's prevalence and characteristics in two socio-economic groups. Am J Dis Child 1969: 118: 694\u2013699.","DOI":"10.1001\/archpedi.1969.02100040696005"},{"key":"S1047951103000064_ref008","unstructured":"Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543\u2013550."},{"key":"S1047951103000064_ref033","doi-asserted-by":"crossref","unstructured":"Spagnuolo M , Pasternack B , Taranta A . Risk of rheumatic fever recurrences after streptococcal infections. Prospective study of clinical and social factors. N Engl J Med 1971; 285: 641\u2013647.","DOI":"10.1056\/NEJM197109162851201"},{"key":"S1047951103000064_ref038","unstructured":"Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539\u2013549."},{"key":"S1047951103000064_ref023","doi-asserted-by":"crossref","unstructured":"Feinstein AR , Spagnuolo M . The clinical patterns of acute rheumatic fever; A reappraisal. Medicine 1962; 41: 279\u2013305.","DOI":"10.1097\/00005792-196212000-00001"},{"key":"S1047951103000064_ref018","unstructured":"Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501\u20131515."},{"key":"S1047951103000064_ref027","unstructured":"Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8\u201314."},{"key":"S1047951103000064_ref034","unstructured":"Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14\u201316."},{"key":"S1047951103000064_ref044","unstructured":"Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389\u2013395."},{"key":"S1047951103000064_ref006","unstructured":"Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. 
Indian pediatr 1983; 20: 849\u2013853."},{"key":"S1047951103000064_ref030","unstructured":"Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599\u2013603."},{"key":"S1047951103000064_ref015","doi-asserted-by":"crossref","unstructured":"Robinson RD , Sultana S , Abbasi AS et al. Acute rheumatic fever in Karachi, Pakistan. Am J Cardiol 1966; 8: 548\u2013551.","DOI":"10.1016\/0002-9149(66)90009-9"}],"container-title":["Cardiology in the Young"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.cambridge.org\/core\/services\/aop-cambridge-core\/content\/view\/S1047951103000064","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,4,6]],"date-time":"2020-04-06T22:32:57Z","timestamp":1586212377000},"score":1,"subtitle":[],"short-title":[],"issued":{"date-parts":[[2003,2]]},"references-count":46,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2003,2]]}},"alternative-id":["S1047951103000064"],"URL":"http:\/\/dx.doi.org\/10.1017\/s1047951103000064","relation":{},"ISSN":["1047-9511","1467-1107"],"issn-type":[{"value":"1047-9511","type":"print"},{"value":"1467-1107","type":"electronic"}],"subject":["Cardiology and Cardiovascular Medicine","General Medicine","Pediatrics, Perinatology, and Child Health"],"published":{"date-parts":[[2003,2]]}}} \ No newline at end of file
diff --git a/python/tests/files/dlib_05vanhyning.html b/python/tests/files/dlib_05vanhyning.html
new file mode 100644
index 0000000..dbe3ef7
--- /dev/null
+++ b/python/tests/files/dlib_05vanhyning.html
@@ -0,0 +1,350 @@
+<!DOCTYPE html>
+<html lang="en" itemscope itemtype="http://schema.org/Article">
+<head>
+<script type="text/javascript" src="/js/ga.js"></script>
+<style type="text/css">
+
+.topLeft { border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftThick { border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftRight {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftRightThick {border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftBottom {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.all {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+table.plain {border-collapse: separate;
+ border-spacing: 0px;
+ margin-left: auto;
+ margin-right: auto;
+ }
+td.plain {padding: 6px;
+ vertical-align: text-top;
+ }
+
+table.author {border-collapse: separate;
+ border-spacing: 6px;
+ }
+td.authors {padding: 6px;
+ }
+
+li:not(:last-child) {
+ margin-bottom: .5em;
+ }
+
+div.center {margin-left: auto; margin-right: auto;
+ }
+
+</style>
+<meta charset="utf-8" />
+<meta id="DOI" content="10.1045/may2017-vanhyning" />
+<meta itemprop="datePublished" content="2017-05-15" />
+<meta id="description" content="D-Lib Magazine Article" />
+<meta id="keywords" content="Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS" />
+<link href="../../../style/style1.css" rel="stylesheet" type="text/css" />
+
+<title>Transforming Libraries and Archives through Crowdsourcing</title>
+</head>
+
+<body>
+<form action="/cgi-bin/search.cgi" method="get">
+
+<div style="height:2px;background:#2b538e"></div>
+<div style="height:4px;background:#4078b1"></div>
+
+<div style="height:30px;background:#4078b1">
+
+<span style="color: #ffffff; font-size: 12px; float: right; margin-right: 10px;">Search D-Lib:
+<input type="text" id="words" value="" size="25" />
+<input type="submit" id="search" value="Go!" />
+<input type="hidden" id="config" value="htdig" />
+<input type="hidden" id="restrict" value="" />
+<input type="hidden" id="exclude" value="" />
+</span>
+</div>
+
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:1px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:1px;background:#2b538e"></div>
+<div style="height:92px;background:#4078b1"><img width="450" height="90" alt="D-Lib-blocks5" src="../../../img2/D-Lib-blocks5.gif">
+</div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#e04c1e"></div>
+<div style="height:24px;background:#eda443"><img src="../../../img2/magazine5.gif" alt="The Magazine of Digital Library Research" width="830" height="24" /></div>
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:28px;background:#2b538e">
+<div id="navtable">
+<table>
+<tr><td class="navtext"><img src="../../../img2/transparent.gif" alt="" width="20" height="20" /><a href="../../../dlib.html">HOME</a>&nbsp;|&nbsp;<a href="../../../about.html">ABOUT D-LIB</a>&nbsp;|&nbsp;<a href="../../../contents.html" class="navtext">CURRENT ISSUE</a>&nbsp;|&nbsp;<a href="../../../back.html">ARCHIVE</a>&nbsp;|&nbsp;<a href="../../../author-index.html">INDEXES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/groups.html">CALENDAR</a>&nbsp;|&nbsp;<a href="../../author-guidelines.html">AUTHOR GUIDELINES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/mailman/listinfo/dlib-subscribers">SUBSCRIBE</a>&nbsp;|&nbsp;<a href="../../letters.html">CONTACT D-LIB</a></td></tr></table></div></div>
+<div style="height:4px;background:#2b538e"></div>
+<div style="height:1px;background:#e04c1e"></div>
+
+<div style="padding-left: 2.5em; padding-top: 1em;">
+
+<h3 class="blue-space">D-Lib Magazine</h3>
+<p class="blue">May/June 2017<br />
+Volume 23, Number 5/6<br />
+<a href="../05contents.html">Table of Contents</a>
+</p>
+
+<div class="divider-full">&nbsp;</div>
+
+<h3 class="blue-space">Transforming Libraries and Archives through Crowdsourcing</h3>
+
+<p class="blue">Victoria Van Hyning, University of Oxford, Zooniverse<br />
+victoria [at] zooniverse.org<br /><br />
+
+Samantha Blickhan, The Adler Planetarium, Zooniverse<br />
+samantha [at] zooniverse.org<br /><br />
+
+Laura Trouille, The Adler Planetarium, Zooniverse<br />
+trouille [at] zooniverse.org<br /><br />
+
+Chris Lintott, University of Oxford, Zooniverse<br />
+chris [at] zooniverse.org</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p><a href="https://doi.org/10.1045/may2017-vanhyning" class="nolinka">https://doi.org/10.1045/may2017-vanhyning</a></p>
+
+<div class="divider-full">&nbsp;</div>
+ <!-- Abstract or TOC goes here -->
+
+<h3 class="blue">Abstract</h3>
+
+<p class="blue">This article will showcase the aims and research goals of the project entitled "Transforming Libraries and Archives through Crowdsourcing", recipient of a 2016 Institute for Museum and Library Services grant. This grant will be used to fund the creation of four bespoke text and audio transcription projects which will be hosted on the Zooniverse, the world-leading research crowdsourcing platform. These transcription projects, while supporting the research of four separate institutions, will also function as a means to expand and enhance the Zooniverse platform to better support galleries, libraries, archives and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing.</p>
+
+<p class="blue">Keywords: Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS</p>
+
+<!-- Article goes next -->
+
+<div class="divider-full">&nbsp;</div>
+<h3>1 Overview<span style="vertical-align: super;"><a href="#n6">1</a></span></h3>
+
+<p>As libraries, museums, and other cultural repositories digitize their collections and place them online, the challenges of transforming these materials into useful and searchable sources of information are becoming increasingly apparent. While OCR and handwriting recognition technology have opened up some print and manuscript corpora, and image and voice recognition software are improving daily, there are still many tasks that require human intervention. For these, volunteer crowdsourcing is a viable and vibrant solution.</p>
+
+<p>The <a href="https://www.zooniverse.org/">Zooniverse</a> is the world-leading research crowdsourcing platform, hosting over 50 active projects and over 100 projects total since its inception in 2007. The projects cover diverse subject areas from astronomy to zoology, engage over 1.5 million registered volunteers, and have produced data used in more than a hundred peer-reviewed articles.<span style="vertical-align: super;"><a href="#n1">2</a></span> The Zooniverse also hosts the <a href="https://www.zooniverse.org/lab">Project Builder</a>, a free platform through which anyone can build their own project. The Zooniverse grew from a single project developed at the University of Oxford in 2007, and is now developed and managed by a team based in Oxford and at the Adler Planetarium in Chicago and the University of Minnesota (see <a href="https://www.zooniverse.org/about/team">Zooniverse Team</a> for a more complete list).</p>
+
+<p>In late 2016, the Institute for Museum and Library Services awarded a National Leadership Grant titled "Transforming Libraries and Archives through Crowdsourcing (LG-71-16-0028-16)" to the Adler Planetarium and its collaborators to support the work of the Zooniverse. Through this grant-funded effort, the Zooniverse will further expand and enhance its platform to better support galleries, libraries, archives, and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.1 What Can Crowdsourcing Offer GLAMs?</h4>
+
+<p>In 2010, author and professor Clay Shirky delivered a rousing <a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">TED</a> talk in which he used the phrase "cognitive surplus" to describe the one trillion hours of leisure time humans collectively accumulate each year (a great deal of which is spent watching television), which could be harnessed to advance human knowledge through civic engagement. He concluded that: "free cultures get what they celebrate. [...If we] celebrate and support and reward the people trying to use cognitive surplus to create civic value [...] we'll be able to change society".[<a href="#1">1</a>] One way that GLAMs can harness this cognitive surplus is through web-based crowdsourcing. What Shirky was describing was a type of "social machine", which Tim Berners-Lee defined as "new form[s] of social processes" emergent from the Web, and involving both human and machine components.[<a href="#2">2</a>] </p>
+
+<p>Academic crowdsourcing invites members of the public to work with specialists to conduct research: for example, to transcribe documents or add metadata to a collection of images, video or audio clips. This data is used in real science, social science, or humanities investigations and should, ideally, lead to publication. Crowdsourcing within GLAMs may not always be oriented around a specific research question or publication, but around making collections more accessible for future research and usability. GLAM crowdsourcing can be the seedbed of future scholarly research.</p>
+
+<p>GLAMs have been engaging volunteers with their collections for well over a century, usually by inviting select individuals into an institution and training them to do work that cannot be done by staff due to time or money constraints. On-site volunteers often build up valuable knowledge and skills and contribute a great deal to their chosen institutions, but training and supervising them also poses challenges. There is a limit to how many volunteers can be trained, supported on site, and indeed attracted and retained in the first place. Online volunteering, enabled by crowdsourcing platforms such as Zooniverse.org, offer an alternative or complementary form of engagement that has many benefits. Online projects can reach a wider range of individuals, including those who are less able-bodied or geographically remote from the institution in which they want to volunteer and/or unable to travel. Such projects require less training and time commitment from volunteers and typically attract a larger number of participants than on-site programs. They also enable GLAMs to open up rare collections to the public without concern for their material safety and security.<span style="vertical-align: super;"><a href="#n2">3</a></span></p>
+
+<p>While crowdsourcing projects have proliferated in the last decade, few offer easy to use, open source, and free platforms on which GLAM academics and amateur users can rely. The Zooniverse has the infrastructure, community, and technical expertise to intervene at this critical stage. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.2 How Does The Zooniverse Work?</h4>
+
+<p>All bespoke Zooniverse projects, including those built on the free Project Builder, have a few core components. Each image, audio or video file (data point) in each project is independently assessed by multiple individuals, whose responses are then aggregated using a variety of algorithms to determine what is in a given image. The amount of required responses for a task to be considered "complete" varies, depending on the project. With relatively quick tasks, such as animal identification in Snapshot Serengeti, upwards of 70 people will see each image. In tasks that require more time, such as transcription projects like <a href="https://www.shakespearesworld.org/#!/">Shakespeare's World</a> and <a href="https://anno.tate.org.uk/#!/">AnnoTate</a>, at least three people transcribe each line on each page. If enough people transcribe the same line and our algorithms deem the line to be completed to a good enough standard, these are greyed out, while outstanding lines are available to future site visitors. This approach was designed along the same principles that underpin all other Zooniverse projects, in which it is assumed that volunteers should work independently on tasks, in order that no one individual should have undue influence over others in the crowd. In the current IMLS project, however, we will test whether allowing volunteers to transcribe and work collaboratively ultimately creates better data and/or better user experiences. We will be able to compare datasets from AnnoTate and Shakespeare's World with text transcription datasets from the two new bespoke text transcription projects and, hopefully, with datasets generated at other institutions that have online crowdsourcing projects. Zooniverse is in a unique position in being able to gather these two very different kinds of data and compare them in order to determine the best outcomes. These findings will ultimately drive our design of free tools on the Project Builder.
+
+<p>In addition to participating in the classification task, users have the opportunity to communicate with other volunteers through an active, object-oriented discussion forum, called "Talk", associated with each project. Here volunteers can ask questions, interact with researchers and fellow volunteers, create their own "collections", and use hashtags to group together posts or images of interest. An example of the latter is <a href="https://talk.sciencegossip.org/#/search?tags%5Bfemale%5D=true">#female</a> from the <a href="https://www.sciencegossip.org/">Science Gossip</a> project, which indicates female authors, illustrators and printers contributing to the main scientific journals in the nineteenth century (visit the <a href="https://talk.sciencegossip.org/#/boards/BSC0000004/discussions/DSC00004s8">Science Gossip Talk</a> board to view the discussion around this tag). These interactions provide a rich set of experiences that allow users to personally experience the community in which they are participating, beyond simply providing classifications. Additionally, the collections allow volunteers to create their own research focal points within existing projects. During the process of transcribing, users can save images that contain content that is pertinent to their research interests by adding them to a public collection. They can then use the Talk forum to publicize their search, allowing other users to add images to that collection as well. In this way, the volunteer base can be mobilized to help other volunteers with minimal effort required.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>2 IMLS Funded Effort: Approach and Focus</h3>
+
+<p>Through the IMLS grant, the Zooniverse will engage in a research and development program to identify and implement crowdsourcing best practices in the arenas of text and audio transcription for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read. Though to date the majority of Zooniverse projects have been based in STEM fields rather than in the humanities, several text transcription projects have already been hosted on the site. For example, the first Zooniverse humanities project was <a href="https://www.ancientlives.org/">Ancient Lives</a>, which invited volunteers to transcribe ancient papyri one letter at a time using a clickable keyboard on their screen: volunteers did not have to be fluent in ancient Greek, they only needed to character match. Over 250,000 volunteers participated in the project, and made more than 1.5 million transcriptions between 2011 and 2014.[<a href="#6">3</a>] Furthermore, the computational pipeline used to convert individual identified letters into consensus-based transcriptions will benefit future classification projects attempting consensus letter or line sequence identifications.[<a href="#7">4</a>]</p>
+
+<p>By 2018 we will build four bespoke projects, two projects for text transcription and two projects for audio transcription, identified through open calls, in order to test, iterate, and research the efficacy of new and existing approaches (including within current Zooniverse and other projects) in these arenas. We will also develop the foundation for a GLAM-friendly data pipeline to export data from a Zooniverse project into GLAM collections. These functionalities are among those most frequently requested by GLAM institutions. We will work closely with four different GLAM institutions to build these bespoke crowdsourcing projects and functionalities. The text transcription open call closed in February 2017, with thirty-one submissions. The audio transcription open call will occur in fall 2017 (see <a href="http://zooniverse.org/get-involved/call-for-projects">Call for Projects</a>).</p>
+
+<p>From the lessons learned in building these bespoke projects, we will explore adding new tools and functionality to the Project Builder, which is freely available to any institution or user who wishes to lead a project. It is a flexible, powerful, and easy-to-use resource for building crowdsourcing projects, with a wide range of potential applications for GLAM collections, including text transcription. A basic text transcription tool is currently available, but will be refined through this grant effort. The Zooniverse has previously used this model of building bespoke projects in order to learn which tools are most useful, before implementing these tools in the Project Builder. We recognize that volunteers' time is precious, and are therefore unwilling to waste it with tools that are not proven to extract data in an efficient, high quality, and useful form. We will also draw on lessons learned from previous experiences supporting transcription projects through Zooniverse and other platforms. For example, <a href="https://www.operationwardiary.org/">Operation War Diary</a> which launched in 2014 to commemorate the outbreak of the First World War, is a partnership between the National Archives (UK), the Imperial War Museum, and the Zooniverse, which invites users to tag and transcribe dates, times, places, and names found in British WWI field diaries. Historian Richard Grayson has used the data to penetrate more deeply than ever before into records of soldiers' daily lives on the front.[<a href="#8">5</a>] All of the Operation War Diary metadata will eventually be integrated into the National Archive catalogues. The process of integrating new metadata into an existing catalogue can be complicated, raising an important question for any GLAM specialist seeking to harness crowdsourcing at their institution. For instance, it is essential to ensure, before starting a project, that the current content management system (CMS) supports the storage of additional metadata, such as large amounts of free-text. If not, it then becomes necessary to use an external resource to make available the results from the crowdsourcing project. Zooniverse can and will do more to facilitate GLAMs and research groups to use and store their data.</p>
+
+<p>Over the course of the IMLS project, we will also address the following research questions:</p>
+
+<p class="indentLeft">Q1: How can crowdsourcing be deployed in the arenas of text and audio transcription and metadata extraction for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read? What methods produce the best data and make for the best user experience?</p>
+
+<p class="indentLeft">Q2: Does the current Zooniverse methodology of multiple independent transcribers and aggregation render better results than allowing volunteers to see previous transcriptions by others or indeed collaborate to create a single transcription? How does each methodology impact the quality of data, as well as depth of analysis and participation?</p>
+
+<p class="indentLeft">Q3: How can we extend our crowdsourcing expertise to more GLAM professionals and learn from them, in turn, how to adjust the Zooniverse platform to best meet their research and curatorial needs?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.1 Addressing Q1 (Crowdsourcing for GLAM)</h4>
+
+<p>Only a platform like the Zooniverse can systematically address a question such as Q1: the community that has developed within the platform is made up of volunteers who move across projects, allowing us to trace the impact of differences between projects on the same volunteers. Zooniverse also has the infrastructure to implement A/B split experiments within a single project. This allows us to develop projects incorporating different practices which are specifically aimed at understanding different methodologies. Through the bespoke text and audio transcription projects, we will expand on the lessons learned through current Zooniverse text transcription projects, including Ancient Lives, AnnoTate, Old Weather, Measuring the ANZACs, Shakespeare's World, Science Gossip, Decoding the Civil War, Orchid Observers and Operation War Diary, as well as from external text transcription projects including <a href="http://blogs.ucl.ac.uk/transcribe-bentham/">Transcribe Bentham</a>, <a href="http://fromthepage.com/">FromthePage</a>, and <a href="http://scripto.org/">Scripto</a>. </p>
+
+<p>In the bespoke projects created through the IMLS grant, the features optimizing volunteer engagement and retention will include: </p>
+
+<ul>
+ <li><i>Volunteer choice:</i> volunteers choose which document to transcribe and can transcribe as little as a single line or as much as an entire document. We have found through AnnoTate and Shakespeare's World that allowing users to transcribe smaller fragments of text (without being required to complete an entire page) mitigates against forced or uncertain readings. We hypothesize and plan to fully test whether allowing microtasking helps to retain volunteers, giving them the chance to build up their skills and not make forced readings. </li>
+
+ <li><i>Keeping the task simple:</i> in Shakespeare's World and AnnoTate, volunteers drop points at the start and end of individual lines of text (not grammatical sentences) and transcribe the text contained between these two points. They do not use XML markup itself, which has proven to be a major repellent to participants in other text transcription crowdsourcing projects.<span style="vertical-align: super;"><a href="#n3">4</a></span> Instead, volunteers highlight words within the transcribed line and choose among different features (e.g., insertion, deletion, expansion, etc.). We propose to use these tagged words in each line to create simple TEI markup on the back-end, for output into commonly used CMSs such as Drupal and Omeka.</li>
+
+ <li><i>Narrowing the content focus to support sense-making:</i> In Shakespeare's World, the first release (or "chapter") consists of recipes and letters, with more genres to follow. This type of structured approach will be applied to the bespoke projects, as this supports creation of narratives within diverse collections, which in turn enables subject experts to more easily foster, and volunteers to contribute to, discussions in Talk.</li>
+</ul>
+
+<p>Features optimizing best practice in regard to data production and management will include:</p>
+
+<ul>
+ <li><i>Reliable, Scalable, Open Source Code Infrastructure:</i> The foundation for the Zooniverse platform that includes the Project Builder is an application written in Ruby on Rails which supports a powerful Application Programming Interface (API). The API serves subjects &#151; images, video or audio &#151; for classification by volunteers via a workflow defined by the project, and receives and records these classifications into a database. The frontend Javascript web software presents user interfaces to volunteers and supports the Project Builder. All Zooniverse code is open source and available through <a href="github.com/zooniverse">Github</a>.</li>
+
+ <li><i>Data Ingestion into Zooniverse:</i> In the current Project Builder, research teams can upload batches of 500 to 1000 subjects (images, videos, or audio clips) at a time by simply dragging and dropping the files. For larger collections and for bespoke projects, typically the research team provides a hard drive and the Zooniverse team uploads the subjects to the API. Through the projects proposed here, we will create a system to better support direct ingestion of large subject sets through a user-friendly web interface, adding functionality to the foundation we already have in place within the Project Builder.</li>
+
+ <li><i>Useful Output for Curation:</i> The Smithsonian Transcription Center is regularly cited as being successful in regard to their output being easily ingestible by CMSs.[<a href="#9">6</a>] Current Zooniverse transcription projects are not set up with this functionality. Currently, through our Project Builder for image annotation/marking projects, research teams can download the raw classification results (i.e. all classifications by all volunteers) as well as automatically-generated aggregated results that include confidence measures on consensus. Through this IMLS-funded effort, we will work with Meghan Ferriter of the Smithsonian Transcription Center, who is on our board of advisors, to design data outputs for full text transcription and full audio transcription that are suitable for ingestion into different GLAM CMSs. A key aspect of this effort is to continue exploring best practices and approaches for transcription aggregation and confidence metrics, building on our efforts with AnnoTate, Shakespeare's World, etc.</li>
+</ul>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.2 Addressing Research Q2 (Independent vs. Collaborative Transcription)</h4>
+
+<p>Through the two bespoke text transcription projects, we will investigate the impact on transcription quality and volunteer experience when volunteers transcribe in isolation versus with knowledge of how others have transcribed the same document. </p>
+
+<p>In terms of measuring impact on transcription quality, we will compare the rate of accuracy for individuals who transcribe in isolation on projects such as AnnoTate and Shakespeare's World versus individuals who see previous transcriptions. We will also compare the rate of accuracy in aggregated results for lines transcribed only by those working in isolation versus for lines in which all but the first transcriber sees previous transcriptions. In order to measure impact on volunteer experience, we will analyze the user behavior statistics we gather, e.g., number of transcriptions completed in a given session, length of session, number of sessions overall, sentiment analysis of discussion forum comments, etc.</p>
+
+<p>There are numerous open questions in this experiment: Does knowledge of other individuals' or collective transcriptions lead individuals down the wrong path? Is transcription more or less accurate if people work in isolation or with an awareness of other people's work? Does making transcriptions visible increase retention as a result of highlighting that an individual's effort is part of a broader community effort or have the opposite effect? What environment best promotes skills acquisition, i.e. improved paleography?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.3 Addressing Research Q3 (Feedback/Training)</h4>
+
+<p>We will provide numerous opportunities for input and feedback from and training for the GLAM community, specifically by working closely with our advisory board and four GLAM project partners throughout. In 2018 we will host feedback sessions at GLAM conferences and summer schools targeting GLAM institutions with collections for which text transcription, audio transcription, or image annotation/marking are of interest (we will include image annotation/marking because those tools are already included via the Project Builder). This will allow for input from a broader set of institutions on our decisions and approach for building new functionality into the Project Builder. In 2018&#151;2019 we will host training workshops for GLAM professionals in using the Project Builder to build their own crowdsourcing projects, incorporate the results into their databases and research, and sustain and nurture their online volunteer communities.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>3 Future Steps: Community Engagement, Output &amp; How to Get Involved</h3>
+
+<p>The IMLS-Funded Project "Transforming Libraries and Archives through Crowdsourcing" is still in its beginning stages. Currently, we are in the process of selecting the first two bespoke crowdsourcing text transcription projects to be built and incorporated into the Zooniverse platform. The detail of our research questions will evolve alongside these new transcription projects, and during the research and development process we will use conference presentations and feedback sessions to gather input which can then guide the overall project design. The open call for the two bespoke audio transcription projects will occur in the fall of 2017. At this point, the bespoke text transcriptions will be in beta review, allowing us to take advantage of lessons learned through that first round of new projects. We believe that this self-reflexive method will simultaneously benefit our ongoing project while offering new tools and ideas to the larger GLAM and academic community.</p>
+
+<p>We anticipate this proposed effort will produce two peer-reviewed publications. One article will focus on the methodology for creating, processing, and evaluating the data produced by the new projects. The second will focus on the results of our research exploring the impact of individual versus collaborative text transcription. We also note that all Zooniverse <a href="github.com/zooniverse">code</a> is freely available under a liberal open source license which serves as an additional or parallel form of publication.</p>
+
+<p>GLAM organizations keen to develop their own crowdsourcing projects should explore the available documentation on <a href="https://www.zooniverse.org/lab-how-to">how to build a project</a> and <a href="https://www.zooniverse.org/lab-best-practices/great-project">best practices for the design, launch and long term phases of a project</a>. While building a project is easy and requires relatively little technical support from Zooniverse or your institution, make sure you have the time to work with your resulting data, and time to support your online volunteer community. Advertising the project's existence should be a long-term task, to avoid a plateau or potential drop-off of user participation. For example, Shakespeare's World received a bump in the number of daily classifications after an article was published in The New Yorker in January of 2017, over a year after the project's launch date.[<a href="#10">7</a>] However, it does not suffice to merely advertise the existence of a project; researchers need to engage with their users on a regular basis.<span style="vertical-align: super;"><a href="#n5">5</a></span> Zooniverse's Talk platform, social media such as blogging, Twitter, Instagram, and indeed in-person or on-site events all provide important channels for engaging current or potential volunteers with your collections. We believe that GLAM organizations, with their long history of volunteer engagement, have many of the skills to work effectively with online volunteers, and will benefit in new ways through cooperation with the crowd.</p>
+
+<p>In conclusion, while this project is specifically focused on text and audio transcription, it is our hope that the results, including the new Project Builder tools and GLAM data pipeline, will ultimately be used across a variety of disciplines and domains. We hope to facilitate future partnerships between GLAM institutions and volunteer communities around the world, thus extending the aims and outcomes of the National Digital Platform funded through this generous IMLS grant into an international digital platform that will benefit many individuals and institutions. </p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>Notes</h3>
+
+<table style="width:90%">
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n6">1</a></td>
+<td style="padding-top: .5em;">Part of this article appeared previously as a blog post for CILIP, The Library and Information Association. Material is reproduced by express permission of CILIP.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n1">2</a></td>
+<td style="padding-top: .5em;">For a partial list of publications, please visit <a href="https://www.zooniverse.org/about/publications">https://www.zooniverse.org/about/publications</a>. </td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n2">3</a></td>
+<td style="padding-top: .5em;">Further discussion of the use of crowdsourcing in GLAM contexts can be found in Melissa Terras, "Crowdsourcing in the Digital Humanities", in <i>A New Companion to Digital Humanities</i>, eds. Susan Schreibman, Ray Siemens, and John Unsworth (John Wiley &amp; Sons, 2016), 420-438, particularly in the section entitled "The Growth of Crowdsourcing in Cultural and Heritage Applications" (pp. 423-28). See also <i>Crowdsourcing Our Cultural Heritage</i>, ed. Mia Ridge (Ashgate, 2014).</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n3">4</a></td>
+<td style="padding-top: .5em;">Causer and Terras, "Many Hands Make Light Work", p. 81: "It would be fair to say that for volunteers, the XML mark-up complicates participation, and it has undoubtedly dissuaded many from participating more fully, or at all." For opinions from the volunteers about the process, the authors additionally refer the reader to Causer and Valerie Wallace, "<a href="http://www.digitalhumanities.org/dhq/vol/6/2/000125/000125.html">Building a Volunteer Community: Results and Findings from Transcribe Bentham</a>", <i>Digital Humanities Quarterly</i> 6.2 (2012).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n5">5</a></td>
+<td style="padding-top: .5em;">Or, as Zephyr Frank, <i>et al</i>. put it: "Paid advertising can generate large numbers of clicks on a website. It cannot, however, produce good metadata or newly uploaded material that is relevant to the scholarly questions posed by academic researchers." "<a href="https://github.com/cestastanford/crowdsourcing/raw/master/files/Mellon%20White%20Paper.pdf">Crowdsourcing for Humanities Research</a>" (2016) Project White Paper. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>References</h3>
+
+<table style="width:90%">
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="1">[1]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Clay Shirky, "<a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">How Cognitive Surplus Will Change the World</a>", June 2010.</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="2">[2]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Tim Berners-Lee with Mark Fischetti, <i>Weaving the Web: The Original Design and Ultimate Destiny of the World Wide Web by its Inventor</i> (San Francisco: Harper, 1999).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="6">[3]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">"P.Oxy 5156, Plutarch Moralia 660C, 661B-C (Quaestiones Convivales IV PR., 1.2)", in <i>The Oxyrhynchus Papyri</i>, R.-L. Chang <i>et al</i>., eds, vol. 78 (London, Egypt Exploration Society, 2012), 97-98. </td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="7">[4]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Alex C. Williams <i>et al.</i>, "A Computational Pipeline for Crowdsourced Transcriptions of Ancient Greek Papyrus Fragments", in <i>IEEE International Conference on Big Data</i>, October 2014. <a href="https://doi.org/10.1109/BigData.2014.7004460">https://doi.org/10.1109/BigData.2014.7004460</a></td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="8">[5]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Richard Grayson, "A Life in the Trenches? The Use of Operation War Diary and Crowdsourcing Methods to Provide an Understanding of the British Army's Day-to-Day Life on the Western Front", <i>British Journal for Military History,</i> 2.2 (2016), 160-85.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="9">[6]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Katie Mika, "<a href="http://library.mcz.harvard.edu/blog/transcription-tools-survey-katie-mika-ndsr-resident">Transcription Tools: a survey by Katie Mika, NDSR Resident</a>", Harvard University, Ernst Mayr Library Blog.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="10">[7]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Roberta Kwok, "<a href="http://www.newyorker.com/tech/elements/crowdsourcing-for-shakespeare">Crowdsourcing For Shakespeare</a>", <i>The New Yorker</i>, 16 Jan. 2017. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>About the Authors</h3>
+
+<p class="blue"><b>Victoria Van Hyning</b> is a Junior Research Fellow at Pembroke College, and a British Academy Postdoctoral Fellow. Her current project, 'Court to Convent: Early Modern English Catholic Women's Autobiography', will reveal how Catholic women articulated selfhood in the period when it was illegal to practice Catholicism, 1535 to 1829. She is also the Humanities PI of Zooniverse.org, the world leading academic crowdsourcing organization. Her projects include <a href="https://www.sciencegossip.org">Science Gossip</a>, <a href="http://www.shakespearesworld.org">Shakespeare's World</a> and <a href="https://anno.tate.org.uk">AnnoTate</a>.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Samantha Blickhan</b> is the IMLS Postdoctoral Fellow in the Department of Citizen Science at the Adler Planetarium, working on transcription projects for the Zooniverse. She received her Ph.D. in Musicology from Royal Holloway, University of London, with a thesis on the palaeography of British song notation in the 12th and 13th centuries. Her research interests include music and perception, and their relationships with writing systems, technology and pedagogy.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Laura Trouille</b> is co-Investigator for Zooniverse and Director of Citizen Science at the Adler Planetarium where she leads the Zooniverse web development and Teen Programs teams. While earning her Ph.D. in astronomy in 2010 studying galaxy evolution, she also earned the Center for the Integration of Research, Teaching and Learning's Delta certificate for STEM education research. As a CIERA Postdoctoral Fellow at Northwestern University's CIERA Center for Astrophysics, she continued her research on active galaxies as well as co-led the Computational Thinking in STEM project, bringing computational thinking and modeling curricular materials to high school science and math teachers. </p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue">Chris Lintott is a professor of astrophysics at the University of Oxford, where he is also a research fellow at New College. He is the principle investigator for Galaxy Zoo and the Zooniverse, and his own research focuses on novel modes of crowdsourcing for anomaly detection.</p>
+
+<div class="divider-full">&nbsp;</div>
+
+ <!-- Standard Copyright line here -->
+
+<div class="center">
+<p class="footer">Copyright &reg; 2017 Victoria Van Hyning, Samantha Blickhan, Laura Trouille and Chris Lintott</p>
+</div>
+
+<div style="height:1px;background:#2b538e"></div>
+
+</div>
+</form>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/elife_article.html b/python/tests/files/elife_article.html
new file mode 100644
index 0000000..7aa1361
--- /dev/null
+++ b/python/tests/files/elife_article.html
@@ -0,0 +1,3094 @@
+<!doctype html>
+
+<html lang="en" prefix="og: http://ogp.me/ns#">
+
+<head>
+
+ <meta charset="utf-8">
+
+ <title>Parallel visual circuitry in a basal chordate | eLife</title>
+
+ <meta name="viewport" content="width=device-width,initial-scale=1,shrink-to-fit=no">
+
+ <meta name="format-detection" content="telephone=no">
+
+
+ <link rel="apple-touch-icon" sizes="57x57" href="/assets/favicons/apple-touch-icon-57x57.4aeffd56.png">
+ <link rel="apple-touch-icon" sizes="60x60" href="/assets/favicons/apple-touch-icon-60x60.91474092.png">
+ <link rel="apple-touch-icon" sizes="72x72" href="/assets/favicons/apple-touch-icon-72x72.95fa9e7b.png">
+ <link rel="apple-touch-icon" sizes="76x76" href="/assets/favicons/apple-touch-icon-76x76.a4c54393.png">
+ <link rel="apple-touch-icon" sizes="114x114" href="/assets/favicons/apple-touch-icon-114x114.a8199d6e.png">
+ <link rel="apple-touch-icon" sizes="120x120" href="/assets/favicons/apple-touch-icon-120x120.efde6c5c.png">
+ <link rel="apple-touch-icon" sizes="144x144" href="/assets/favicons/apple-touch-icon-144x144.457f5c5e.png">
+ <link rel="apple-touch-icon" sizes="152x152" href="/assets/favicons/apple-touch-icon-152x152.5aea1932.png">
+ <link rel="apple-touch-icon" sizes="180x180" href="/assets/favicons/apple-touch-icon-180x180.21337439.png">
+ <link rel="icon" type="image/svg+xml" href="/assets/favicons/favicon.ee498e7d.svg">
+ <link rel="icon" type="image/png" sizes="32x32" href="/assets/favicons/favicon-32x32.825ee0ea.png">
+ <link rel="icon" type="image/png" sizes="192x192" href="/assets/favicons/android-chrome-192x192.365fe68b.png">
+ <link rel="icon" type="image/png" sizes="16x16" href="/assets/favicons/favicon-16x16.337f389b.png">
+ <link rel="shortcut icon" href="/assets/favicons/favicon.a755add0.ico">
+ <link rel="manifest" href="/assets/favicons/manifest.cff74b51.json">
+ <meta name="theme-color" content="#ffffff">
+ <meta name="application-name" content="eLife">
+
+
+
+
+
+
+ <meta name="dc.format" content="text/html">
+ <meta name="dc.language" content="en">
+ <meta name="dc.publisher" content="eLife Sciences Publications Limited">
+
+ <meta name="dc.title" content="Parallel visual circuitry in a basal chordate">
+
+ <meta name="dc.identifier" content="doi:10.7554/eLife.44753">
+
+ <meta name="dc.date" content="2019-04-18">
+
+ <meta name="dc.rights" content="© 2019 Kourakis et al.. This article is distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use and redistribution provided that the original author and source are credited.">
+
+
+
+ <meta name="dc.contributor" content="Matthew J Kourakis">
+ <meta name="dc.contributor" content="Cezar Borba">
+ <meta name="dc.contributor" content="Angela Zhang">
+ <meta name="dc.contributor" content="Erin Newman-Smith">
+ <meta name="dc.contributor" content="Priscilla Salas">
+ <meta name="dc.contributor" content="B Manjunath">
+ <meta name="dc.contributor" content="William C Smith">
+
+
+
+ <meta property="og:site_name" content="eLife">
+ <meta property="og:url" content="https://elifesciences.org/articles/44753">
+ <meta property="og:title" content="Parallel visual circuitry in a basal chordate">
+ <meta name="twitter:site" content="@eLife">
+
+ <meta property="og:description" content="The ascidian Ciona integrates visual information from two photoreceptor types through convergent excitatory and disinhibitory circuits, thereby evoking swim behaviors.">
+ <meta name="description" content="The ascidian Ciona integrates visual information from two photoreceptor types through convergent excitatory and disinhibitory circuits, thereby evoking swim behaviors.">
+
+ <meta name="twitter:card" content="summary">
+
+ <meta property="og:type" content="article">
+
+ <link rel="canonical" href="/articles/44753">
+
+
+
+
+
+
+
+
+
+ <!--[if lt IE 9]>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv.min.js"></script>
+ <![endif]-->
+
+ <script>
+ window.gtmDataLayer = window.gtmDataLayer || [];
+
+ window.gtmDataLayer.push(
+ {
+ 'articleSubjects': 'Neuroscience',
+ 'articleType': 'Research Article',
+ 'articlePublishDate': 'Apr 18, 2019'
+ }
+ );
+
+ (function (w, d, s, l, i) {
+ w[l] = w[l] || [];
+ w[l].push({
+ 'gtm.start': new Date().getTime(), event: 'gtm.js'
+ });
+ var f = d.getElementsByTagName(s)[0],
+ j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
+ j.async = true;
+ j.src =
+ 'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
+ f.parentNode.insertBefore(j, f);
+ })(window, document, 'script', 'gtmDataLayer', 'GTM-WVM8KG');
+ </script>
+
+
+</head>
+
+<body>
+
+ <noscript>
+ <iframe src="https://www.googletagmanager.com/ns.html?id=GTM-WVM8KG" height="0" width="0"
+ style="display:none; visibility:hidden"></iframe>
+ </noscript>
+
+ <div class="global-wrapper" data-behaviour=" CookieOverlay FragmentHandler Math HypothesisLoader"
+ data-item-type="research-article"
+ >
+
+ <div class="global-inner">
+
+ <div class="wrapper wrapper--site-header">
+ <header class="site-header clearfix" data-behaviour="SiteHeader" id="siteHeader">
+ <div class="site-header__title clearfix" role="banner">
+ <div class="site-header__skip_to_content">
+ <a href="#maincontent" class="site-header__skip_to_content__link button button--default">Skip to Content</a>
+ </div>
+ <a href="/" class="site-header__logo_link">
+ <picture class="site-header__logo_link_image">
+ <source srcset="/assets/patterns/img/patterns/organisms/elife-logo-full.b1283c9a.svg" type="image/svg+xml" media="(min-width: 45.625em)">
+ <source srcset="/assets/patterns/img/patterns/organisms/elife-logo-symbol.6f18db13.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/patterns/organisms/elife-logo-full-1x.ce3f6342.png" alt="eLife logo" class="site-header__logo_link"/>
+ </picture>
+ <span class="visuallyhidden" >eLife home page</span>
+ </a>
+ </div>
+ <div class="site-header__navigation" role="navigation" aria-label="Main navigation">
+
+ <nav class="nav-secondary">
+ <ul class="nav-secondary__list clearfix">
+ <li class="nav-secondary__item nav-secondary__item--first">
+
+
+
+
+ <a href="/about">
+
+ About
+ </a>
+
+
+ </li>
+ <li class="nav-secondary__item">
+
+
+
+
+ <a href="/community">
+
+ Community
+ </a>
+
+
+ </li>
+ <li class="nav-secondary__item nav-secondary__item--hide-narrow">
+
+
+
+ <a href="https://reviewer.elifesciences.org/login" class="button button--extra-small button--default" id="submitResearchButton">Submit my research</a>
+
+
+
+ </li>
+ <li class="nav-secondary__item nav-secondary__item--last">
+
+
+ <div class="login-control"
+
+
+ data-behaviour="LoginControl">
+
+
+ <a href="/log-in" class="button button--login" >Log in/Register<span class="visuallyhidden"> (via ORCID - An ORCID is a persistent digital identifier for researchers)</span></a>
+
+ </div>
+
+
+ </li>
+ </ul>
+ </nav>
+
+ <nav class="nav-primary">
+ <ul class="nav-primary__list clearfix">
+ <li class="nav-primary__item nav-primary__item--first">
+
+
+
+
+ <a href="#mainMenu">
+ <picture class="nav-primary__menu_icon">
+
+
+
+ <source srcset="/assets/patterns/img/patterns/molecules/nav-primary-menu-ic.ac4e582f.svg"
+ type="image/svg+xml"
+ >
+
+
+
+ <img srcset="/assets/patterns/img/patterns/molecules/nav-primary-menu-ic_2x.8722f6c7.png 2x, /assets/patterns/img/patterns/molecules/nav-primary-menu-ic_1x.8efd68cc.png 1x"
+ src="/assets/patterns/img/patterns/molecules/nav-primary-menu-ic_1x.8efd68cc.png"
+
+ alt="" />
+
+
+ </picture>
+
+
+ <span class="visuallyhidden nav-primary__menu_text"> Menu </span>
+
+ </a>
+
+
+ </li>
+ <li class="nav-primary__item">
+
+
+
+
+ <a href="/">
+
+ Home
+ </a>
+
+
+ </li>
+ <li class="nav-primary__item">
+
+
+
+
+ <a href="/magazine">
+
+ Magazine
+ </a>
+
+
+ </li>
+ <li class="nav-primary__item">
+
+
+
+
+ <a href="/labs">
+
+ Innovation
+ </a>
+
+
+ </li>
+ <li class="nav-primary__item nav-primary__item--last nav-primary__item--search">
+
+
+
+
+ <a href="/search" rel="search">
+ <picture class="nav-primary__search_icon">
+
+
+
+ <source srcset="/assets/patterns/img/patterns/molecules/nav-primary-search-ic.350bcf38.svg"
+ type="image/svg+xml"
+ >
+
+
+
+ <img srcset="/assets/patterns/img/patterns/molecules/nav-primary-search-ic_2x.0635c16f.png 2x, /assets/patterns/img/patterns/molecules/nav-primary-search-ic_1x.8e357583.png 1x"
+ src="/assets/patterns/img/patterns/molecules/nav-primary-search-ic_1x.8e357583.png"
+
+ alt="" />
+
+
+ </picture>
+
+
+ <span class="visuallyhidden nav-primary__menu_text"> Search the eLife site </span>
+
+ </a>
+
+
+ </li>
+ </ul>
+ </nav>
+
+ </div>
+
+
+ <div class="search-box" data-behaviour="SearchBox">
+ <div class="search-box__inner">
+ <form class="compact-form" id="search" action="/search" method="GET" novalidate>
+ <fieldset class="compact-form__container">
+ <label>
+ <span class="visuallyhidden">Search by keyword or author</span>
+ <input type="search" name="for" value="" placeholder="Search by keyword or author"
+
+ class="compact-form__input"
+
+ >
+ </label>
+
+
+ <button type="reset" name="reset" class="compact-form__reset"><span class="visuallyhidden">Reset form</span></button>
+ <button type="submit" class="compact-form__submit"><span class="visuallyhidden">Search</span></button>
+ </fieldset>
+ </form>
+
+ <label class="search-box__search_option_label">
+ <input type="checkbox" name="subjects[]" value="neuroscience" form="search">Limit my search to Neuroscience
+ </label>
+
+ </div>
+ </div>
+
+</header>
+
+ </div>
+
+
+
+ <main role="main" class="main" id="maincontent">
+
+
+ <header
+ class="content-header wrapper content-header--header content-header--has-social-media-sharers clearfix"
+ data-behaviour="ContentHeader">
+
+
+
+ <ol class="content-header__subject_list">
+ <li class="content-header__subject_list_item">
+ <a href="/subjects/neuroscience" class="content-header__subject_link">
+ <span class="content-header__subject">Neuroscience</span>
+ </a>
+ </li>
+ </ol>
+
+ <ul class="content-header__icons">
+ <li><a href="https://en.wikipedia.org/wiki/Open_access"
+ class="content-header__icon content-header__icon--oa"><span
+ class="visuallyhidden">Open access</span></a></li>
+ <li><a href="https://creativecommons.org/licenses/by/4.0/"
+ class="content-header__icon content-header__icon--cc"><span
+ class="visuallyhidden">Copyright information</span></a></li>
+ </ul>
+
+ <a href="#downloads" class="content-header__download_link">
+ <picture>
+ <source srcset="/assets/patterns/img/icons/download-full.6691999e.svg" type="image/svg+xml" media="(min-width: 45.625em)">
+ <source srcset="/assets/patterns/img/icons/download-full-2x.a54fbeb0.png" type="image/png" media="(min-width: 45.625em)">
+ <source srcset="/assets/patterns/img/icons/download.ecfa2d98.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/icons/download-full-1x.5485093b.png" class="content-header__download_icon" alt="Download icon">
+ </picture>
+ </a>
+
+ <div class="content-header__body">
+ <h1 class="content-header__title content-header__title--short">Parallel visual circuitry in a basal chordate</h1>
+
+
+
+ </div>
+
+ <div class="content-header__authors">
+ <ol class="content-header__author_list" aria-label="Authors of this article">
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#x8d8d9914" data-behaviour="Popup" class="content-header__author_link">Matthew J Kourakis</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#xf3e51472" data-behaviour="Popup" class="content-header__author_link">Cezar Borba</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#xb536f34f" data-behaviour="Popup" class="content-header__author_link">Angela Zhang</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#x1d85dfc3" data-behaviour="Popup" class="content-header__author_link">Erin Newman-Smith</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#x6107dd5d" data-behaviour="Popup" class="content-header__author_link">Priscilla Salas</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#x8b937bbf" data-behaviour="Popup" class="content-header__author_link">B Manjunath</a><span class="content-header__author_suffix"><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ <li class="content-header__author_list_item">
+ <span class="content-header__author"><a href="/articles/44753#xa3814a31" data-behaviour="Popup" class="content-header__author_link">William C Smith</a><span class="content-header__author_suffix">&nbsp;<picture>
+ <source srcset="/assets/patterns/img/icons/corresponding-author.d7eda27b.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/icons/corresponding-author@1x.89247d49.png"
+ srcset="/assets/patterns/img/icons/corresponding-author@2x.808ab270.png 2x, /assets/patterns/img/icons/corresponding-author@1x.89247d49.png 1x"
+ alt="Is a corresponding author" class="content-header__author_icon">
+ </picture><span class="content-header__author_separator" aria-hidden="true">,</span>
+ </span></span>
+ </li>
+ </ol>
+
+ <ol class="content-header__institution_list" aria-label="Author institutions">
+ <li class="content-header__institution_list_item">
+ <span class="content-header__institution">University of California, Santa Barbara, United States<span class="content-header__institution_separator" aria-hidden="true">;</span>
+ </span>
+ </li>
+ </ol>
+ </div>
+
+
+ <div class="content-header__meta">
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/research-article" >Research Article</a>
+
+
+
+ <span class="date"> <time datetime="2019-04-18">Apr 18, 2019</time></span>
+ </div>
+ </div>
+
+
+</header>
+
+
+
+
+
+
+ <div class="wrapper">
+
+ <div class="contextual-data">
+
+ <ul class="contextual-data__list" aria-label="The following contains the number of views, citations and annotations in this article">
+
+ <li class="contextual-data__item"><a href="/articles/44753#metrics">Cited 0</a></li>
+ <li class="contextual-data__item"><a href="/articles/44753#metrics">Views 807</a></li>
+
+ <li class="contextual-data__item" data-hypothesis-trigger><span class="contextual-data__item__hypothesis_opener">Annotations</span> <button class="speech-bubble speech-bubble--small "
+ data-behaviour="SpeechBubble HypothesisOpener"
+
+aria-live="polite">
+ <span class="speech-bubble__inner"><span aria-hidden="true"><span data-visible-annotation-count></span></span><span class="visuallyhidden"> Open annotations. The current annotation count on this page is <span data-hypothesis-annotation-count>being calculated</span>.</span></span>
+</button>
+</li>
+
+ </ul>
+
+ <div class="contextual-data__cite_wrapper">
+ <span class="contextual-data__cite"><span class="contextual-data__cite_label">Cite <span class="visuallyhidden"> this article</span> as:</span> eLife 2019;8:e44753</span>
+ <span class="doi">doi: <a href="https://doi.org/10.7554/eLife.44753" class="doi__link">10.7554/eLife.44753</a></span>
+ </div>
+
+</div>
+
+
+ </div>
+
+
+
+ <div data-behaviour="DelegateBehaviour" data-delegate-behaviour="Popup" data-selector=".article-section:not(#abstract) a">
+
+
+ <div class="wrapper wrapper--content">
+
+ <div class="grid">
+
+
+
+ <div class="grid__item one-whole x-large--two-twelfths">
+
+ <div class="view-selector view-selector--has-figures" data-behaviour="ViewSelector" data-side-by-side-link="https://lens.elifesciences.org/44753">
+ <ul class="view-selector__list">
+ <li class="view-selector__list-item view-selector__list-item--article view-selector__list-item--active">
+ <a href="/articles/44753" class="view-selector__link view-selector__link--article"><span>Article</span></a>
+ </li>
+ <li class="view-selector__list-item view-selector__list-item--figures">
+ <a href="/articles/44753/figures" class="view-selector__link view-selector__link--figures"><span>Figures and data</span></a>
+ </li>
+
+ <li class="view-selector__list-item view-selector__list-item--jump">
+ <span class="view-selector__jump_links_header">Jump to</span>
+ <ul class="view-selector__jump_links">
+ <li class="view-selector__jump_link_item">
+ <a href="#abstract" class="view-selector__jump_link">Abstract</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#s1" class="view-selector__jump_link">Introduction</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#s2" class="view-selector__jump_link">Results</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#s3" class="view-selector__jump_link">Discussion</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#s4" class="view-selector__jump_link">Materials and methods</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#references" class="view-selector__jump_link">References</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#SA1" class="view-selector__jump_link">Decision letter</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#SA2" class="view-selector__jump_link">Author response</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#info" class="view-selector__jump_link">Article and author information</a>
+ </li>
+ <li class="view-selector__jump_link_item">
+ <a href="#metrics" class="view-selector__jump_link">Metrics</a>
+ </li>
+ </ul>
+ </li>
+
+ </ul>
+</div>
+
+ </div>
+
+
+ <div class="content-container grid__item one-whole
+
+ large--eight-twelfths x-large--seven-twelfths
+ grid-column">
+
+
+
+
+
+ <section
+ class="article-section article-section--first"
+ id="abstract"
+ data-behaviour="ArticleSection"
+
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Abstract</h2>
+ </header>
+
+ <div class="article-section__body">
+
+
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="s2"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Results</h2>
+ </header>
+
+ <div class="article-section__body">
+ <section
+ class="article-section "
+ id="s2-1"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Glutamatergic and GABAergic photoreceptors</h3>
+ </header>
+
+ <div class="article-section__body">
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s2-2"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Posterior brain vesicle relay neurons are mixed VGAT- and VACHT-expressing</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Sensory input from the photoreceptors, antenna cells, coronet cells, bipolar tail neurons and a subset of peripheral neurons is directed to a cluster of ~30 RNs in the pBV. These RNs in turn extend axons through the neck to the MG. Among this cluster are the six prRNs and eight pr-AMG RNs (<a href="#fig1">Figure 1</a>; (<a href="#bib43">Ryan et al., 2016</a>)). Previous in situ hybridization studies identified VGAT- and VACHT-expressing neurons in the appropriate place in the BV to be RNs (<a href="#bib56">Yoshida et al., 2004</a>). Moreover, these neurons project axons posteriorly to the MG, a defining characteristic of the pBV RNs. BV neurons expressing other major NTs, including glutamate, dopamine, and serotonin, are neither in the correct brain region to be RNs, nor do they project from the BV to the MG ([<a href="#bib20">Horie et al., 2008b</a>; <a href="#bib31">Moret et al., 2005</a>; <a href="#bib39">Pennati et al., 2007</a>], and our observations). By HCR in situ we observed that the pBV RNs cluster in two distinct groups along the anterior/posterior axis, with the anterior cluster expressing VACHT, and the posterior group expressing VGAT (<a href="#fig3">Figure 3a</a>). We observed an average of 16 (±1.6, n = 9 larvae) VGAT-positive neurons and 11 (±1, n = 8 larvae) VACHT-positive neurons.</p>
+ <div
+ id="fig3"
+ class="asset-viewer-inline asset-viewer-inline-- "
+ data-variant=""
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="fig3"
+ data-asset-viewer-uri="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1500,/0/default.jpg"
+ data-asset-viewer-width="1500"
+ data-asset-viewer-height="1109"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Figure 3</span> with 1 supplement <a href="/articles/44753/figures#fig3" class="asset-viewer-inline__header_link">see all</a>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9paWlmLmVsaWZlc2NpZW5jZXMub3JnL2xheDo0NDc1MyUyRmVsaWZlLTQ0NzUzLWZpZzMtdjIudGlmL2Z1bGwvZnVsbC8wL2RlZmF1bHQuanBn/elife-44753-fig3-v2.jpg?_hash=poB6zI7Tss9wKOFGYwhd40WSG4X9%2B4%2FgYw9ffJwpELo%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1500,/0/default.jpg" class="asset-viewer-inline__open_link" target="_blank" rel="noopener noreferrer"><span class="visuallyhidden">Open asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1500,/0/default.jpg" class="captioned-asset__link" target="_blank" rel="noopener noreferrer">
+ <picture class="captioned-asset__picture">
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1234,/0/default.webp 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/617,/0/default.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/1234,/0/default.jpg 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/617,/0/default.jpg 1x"
+ type="image/jpeg"
+ >
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig3-v2.tif/full/617,/0/default.jpg"
+
+ alt=""
+ class="captioned-asset__image"
+ >
+ </picture>
+ </a>
+
+
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Neurotransmitter use in the relay neurons.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">(<b>a</b>) In situ hybridization of VGAT and VACHT to the relay neurons in the brain vesicle. Also visible is the anterior tip of the motor ganglion. Nuclei are shown as spheres. (<b>b</b>) Confusion matrix for relay neuron registration. (<b>c</b>) Confusion matrix for relay neurons grouped by type. (<b>d</b>) Heat map of neurotransmitter predictions from cell registration of relay neurons, with scale showing color by proportion of iterations predicting either VGAT or VACHT. Abbreviations: ant., anterior; post., posterior; dor., dorsal; vent., ventral; MG, motor ganglion; pr-AMG RN, photoreceptor ascending motor ganglion relay neuron; prRN, photoreceptor relay neuron; AntRN, antenna cell relay neuron; PBRN, photoreceptor-bipolar tail neuron relay neuron; PCRN, photoreceptor-coronet relay neuron; PNRN, peripheral relay neuron; VGAT, vesicular GABA transporter; VACHT, vesicular acetylcholine transporter.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.007" class="doi__link">https://doi.org/10.7554/eLife.44753.007</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">Unlike the ocellus, the pBV RN cluster does not have obvious anatomical features, although the various classes of RNs are clustered, with, for example, the antenna cell RNs (AntRN) being posterior to the photoreceptor RNs (<a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>; <a href="#bib43">Ryan et al., 2016</a>). However, given the diversity of RN types in the pBV it is unlikely that the expression domains of VGAT and VACHT precisely correspond to the clusters of RN classes. In order to make predictions of NT use in the RNs we used the same registration approach as with the photoreceptors (n = 7 VGAT/VACHT double in situ datasets, <a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>). The confusion matrix for the RNs shows a lower level of convergence than for the PR-Is, suggesting that the cellular anatomy of the RN cluster is less structured than the ocellus (<a href="#fig3">Figure 3b</a>; <a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>). However, the confusion matrix also shows that the RNs are most often confused for other RNs of the same class (white boxes in <a href="#fig3">Figure 3b</a>). This is most evident when the registration is performed not with single cells, but with pooled RNs of each class (<a href="#fig3">Figure 3c</a>), and is presumably a reflection of the clustering of RN classes in the pBV. Thus we can have higher confidence in the NT use by RN class than we can have in individual neuron identities. For example, the connectome shows the AntRNs are clustered at the rear of the BV (<a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>; (<a href="#bib43">Ryan et al., 2016</a>)), as are the VGAT expressing neurons (<a href="#fig3">Figure 3a</a>; <a href="/articles/44753/figures#fig3s1">Figure 3—figure supplement 1</a>). Accordingly, the registration predicts that eight of the ten AntRNs are VGAT positive (<a href="#fig3">Figure 3c</a>). For the present study, which focuses on the visuomotor pathway, the registration predicts that five of the eight pr-AMG RNs are VGAT expressing, two are VACHT expressing, and one (pr-AMG RN 157) cannot be resolved (no dual VGAT/VACHT expression was observed in the <i>in situs</i>). On the other hand, the registration predicts that the six prRNs are evenly mixed between VGAT and VACHT expression. These predictions provide starting points for experimental validation detailed below.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s2-3"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">The motor ganglion contains a mixture of cholinergic and GABAergic neurons</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">The MG contains five left/right pairs of motor neurons, as well as several classes of interneurons, including six MGINs, seven AMGs, two ddNs, and two posterior MG interneurons (<a href="#bib43">Ryan et al., 2016</a>). Also described in the MG are two left/right pairs of decussating VGAT-positive neurons (<a href="#bib21">Horie et al., 2009</a>; <a href="#bib35">Nishino et al., 2010</a>). These are likely the same decussating MG neurons as described in the connectome, although the names are slightly different (<i>anterior caudal inhibitory neurons</i> (<a href="#bib21">Horie et al., 2009</a>) versus <i>ascending contralateral inhibitory neurons</i> (<a href="#bib43">Ryan et al., 2016</a>), both abbreviated as ACIN). However, the connectome reports only three ACINs, with the anterior ACIN not paired. It was speculated that this was an anomalous feature of the particular larva used for the ssEM. Supporting this, a second larva being analyzed by ssEM for connectomics shows two pairs of ACINs (K. Ryan, personal communication).</p>
+<p class="paragraph">Like the ocellus, the MG has a well-defined anterior-to-posterior and dorsal-to-ventral cellular anatomy (<a href="#fig4">Figure 4a and b</a>; <a href="#bib43">Ryan et al., 2016</a>; <a href="#bib45">Ryan et al., 2018</a>). Neurotransmitter use by some MG neurons is already documented, including the motor neurons, which are cholinergic (<a href="#bib53">Takamura et al., 2010</a>; <a href="#bib52">Takamura et al., 2002</a>), and the ACINs, which are glycinergic (<a href="#bib35">Nishino et al., 2010</a>). By HCR in situ hybridization we observed VGAT- and VACHT-positive neurons in the MG (<a href="#fig4">Figure 4b</a>), but no VGLUT- or TH-positive cells (data not shown). These results are consistent with previous studies (<a href="#bib20">Horie et al., 2008b</a>; <a href="#bib31">Moret et al., 2005</a>). Likewise, it was reported that no serotonergic cells were present in the MG (<a href="#bib39">Pennati et al., 2007</a>). As with the RNs, the VGAT- and VACHT-expressing neurons in the MG are segregated anatomically. We also found a population of 6–7 cells between the AMGs and the MNs (asterisks in <a href="#fig4">Figure 4a</a>) that were not annotated in the connectome as neurons and that failed to label with any of our NT markers. We hypothesize that these are ependymal cells, which are abundant in the nerve cord immediately caudal to this region.</p>
+ <div
+ id="fig4"
+ class="asset-viewer-inline asset-viewer-inline-- "
+ data-variant=""
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="fig4"
+ data-asset-viewer-uri="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/,1500/0/default.jpg"
+ data-asset-viewer-width="1274"
+ data-asset-viewer-height="1500"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Figure 4</span> with 1 supplement <a href="/articles/44753/figures#fig4" class="asset-viewer-inline__header_link">see all</a>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9paWlmLmVsaWZlc2NpZW5jZXMub3JnL2xheDo0NDc1MyUyRmVsaWZlLTQ0NzUzLWZpZzQtdjIudGlmL2Z1bGwvZnVsbC8wL2RlZmF1bHQuanBn/elife-44753-fig4-v2.jpg?_hash=ywXgEBLsOzGfI3rEs2OHLvcZSgqwkJ8EhBicEWmfAJ8%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/,1500/0/default.jpg" class="asset-viewer-inline__open_link" target="_blank" rel="noopener noreferrer"><span class="visuallyhidden">Open asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/,1500/0/default.jpg" class="captioned-asset__link" target="_blank" rel="noopener noreferrer">
+ <picture class="captioned-asset__picture">
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/1234,/0/default.webp 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/617,/0/default.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/1234,/0/default.jpg 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/617,/0/default.jpg 1x"
+ type="image/jpeg"
+ >
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig4-v2.tif/full/617,/0/default.jpg"
+
+ alt=""
+ class="captioned-asset__image"
+ >
+ </picture>
+ </a>
+
+
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Neurotransmitter use in the motor ganglion.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">(<b>a</b> and <b>b</b>) Expression of VGAT and VACHT by in situ hybridization in the motor ganglion, lateral (<b>a</b>) and dorsal (<b>b</b>) views. Asterisks indicate predicted ependymal cells. (<b>c</b>) Lateral view of VGAT expression in the AMGs. (<b>d</b>) shows same view as c, but with VACHT expression. (<b>e</b>) Diagram of neurons in the motor ganglion (derived from Figure 1 of <a href="#bib44">Ryan et al., 2017</a>). Box indicates approximate positions of panels c and d. Lateral view; anterior is to the left. (<b>f</b>) Dorsal view of VGAT expression in the AMGs. Asterisk indicates central non-VGAT expressing cell. (<b>g</b>) Three dimensional surface rendering of VGAT expressing cells in the AMGs. (<b>h</b>) Diagram of a dorsal view of the motor ganglion. AMG cells are numbered. Abbreviations: dor., dorsal; vent., ventral; ant., anterior; post., posterior; AMG, ascending motor ganglion neuron; MGIN, motor ganglion interneuron; ddN, descending decussating neurons; ACIN, ascending contralateral inhibitory neurons; MN, motor neuron; VGAT, vesicular GABA transporter; VACHT, vesicular acetylcholine transporter.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.009" class="doi__link">https://doi.org/10.7554/eLife.44753.009</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">Because of the highly structured MG cellular anatomy, we can identify the various MG cell types in the in situ data. The anterior group of VGAT-positive cells is clustered dorsally in the MG and corresponds to the AMGs (<a href="#fig4">Figure 4c, d and e</a>; <a href="#bib44">Ryan et al., 2017</a>). In a dorsal view of the MG (<a href="#fig4">Figure 4f,g and h</a>) a ring of VGAT-positive cells was observed with a non-VGAT expressing cell in the center (asterisk, <a href="#fig4">Figure 4f and g</a>). The VGAT-expressing cells appear to be AMGs 1, 2, 3, 4, 6, and 7, while the central cell, which is instead positive for VACHT, appears to be AMG5. The connectome shows that AMG5 differs in its connectivity from the other AMGs. Significantly, AMG5 is the principal synaptic input for PNS neurons. It then synapses to the other AMGs, which in turn project their axons to other cells in the MG, including MGINs and MNs, as well as to the pr-AMG RNs in the BV. In the posterior of the MG we observed two pairs of VGAT-positive neurons, as described previously (<a href="#bib21">Horie et al., 2009</a>). Finally, in the ventral MG we observed a continuous block of VACHT expression that encompasses the anterior three pairs of MNs, the ddNs, and the MGINs. Similar in situ patterns were observed in most larvae (<a href="/articles/44753/figures#fig4s1">Figure 4—figure supplement 1</a>), although the positions of the ACINs were offset in several (see larvae 5 and 6 in <a href="/articles/44753/figures#fig4s1">Figure 4—figure supplement 1</a>), and one larva was observed to be missing both one motor neuron and one ACIN (larva 7 in <a href="/articles/44753/figures#fig4s1">Figure 4—figure supplement 1</a>), suggesting that MG variants, such as the one observed in the animal used in the connectome study, may be relatively common.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s2-4"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Parallel visuomotor circuits</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Our results indicate that the PR-Is, with the exception of two cells, are glutamatergic, while the PR-IIs are a mixture of GABAergic and GABA/glutamatergic. The <i>Ciona</i> genome contains a single glutamate AMPA receptor (AMPAR) (<a href="#bib36">Okamura et al., 2005</a>) that is expressed in larvae in the two antenna cells, and in a small cluster of neurons in the pBV (<a href="#bib18">Hirai et al., 2017</a>). Published results show that most of the pBV group of AMPAR-positive neurons are clustered at the ends of Arrestin-labeled photoreceptor axons, and that they extend their axons to the MG, suggesting they are photoreceptor RNs (see Figure 2B in <a href="#bib18">Hirai et al., 2017</a>). We find that this pBV group is composed of ~6 cells (<a href="/articles/44753/figures#fig5s1">Figure 5—figure supplement 1</a>). To investigate this further, we co-expressed a pAMPAR &gt;GFP construct (<a href="#bib18">Hirai et al., 2017</a>) with pVACHT &gt;CFP and pVGAT &gt;nuclear RFP constructs. We observed coexpression of the AMPAR reporter in a subset of the VACHT-positive RNs, but never in the VGAT-expressing RNs (<a href="#fig5">Figure 5a</a>).</p>
+ <div
+ id="fig5"
+ class="asset-viewer-inline asset-viewer-inline-- "
+ data-variant=""
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="fig5"
+ data-asset-viewer-uri="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/,1500/0/default.jpg"
+ data-asset-viewer-width="1400"
+ data-asset-viewer-height="1500"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Figure 5</span> with 1 supplement <a href="/articles/44753/figures#fig5" class="asset-viewer-inline__header_link">see all</a>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9paWlmLmVsaWZlc2NpZW5jZXMub3JnL2xheDo0NDc1MyUyRmVsaWZlLTQ0NzUzLWZpZzUtdjIudGlmL2Z1bGwvZnVsbC8wL2RlZmF1bHQuanBn/elife-44753-fig5-v2.jpg?_hash=0aDXVHgnozrp0Q80t8%2FT5K718EzJQmgreXzfYFK9oAQ%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/,1500/0/default.jpg" class="asset-viewer-inline__open_link" target="_blank" rel="noopener noreferrer"><span class="visuallyhidden">Open asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/,1500/0/default.jpg" class="captioned-asset__link" target="_blank" rel="noopener noreferrer">
+ <picture class="captioned-asset__picture">
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/1234,/0/default.webp 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/617,/0/default.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/1234,/0/default.jpg 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/617,/0/default.jpg 1x"
+ type="image/jpeg"
+ >
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig5-v2.tif/full/617,/0/default.jpg"
+
+ alt=""
+ class="captioned-asset__image"
+ >
+ </picture>
+ </a>
+
+
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">AMPA receptors in negative phototaxis.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">(<b>a</b>) Coexpression of an AMPA-receptor and VACHT expression constructs in the relay neurons (white asterisks). The main panel shows the merge while smaller panels at right show single channels. (<b>b</b>) Negative phototaxis assay in control larvae. Yellow arrow indicates direction of 505 nm light. By 60 min (m) the majority of the larvae have swum to the side of the dish away from the light (red arrow). (<b>c</b>) Perampanel-treated larvae do not show negative phototaxis. (<b>d</b>) Quantification of negative phototaxis in control and perampanel-treated larvae. Points indicate the averages from three independent assays, ±standard deviation. Y-axis represents the percentage of larvae found on the side away from the light source (distal third). Abbreviations: VGAT, vesicular GABA transporter; VACHT, vesicular acetylcholine transporter.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.011" class="doi__link">https://doi.org/10.7554/eLife.44753.011</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">To assess the function of the AMPAR-positive cells in <i>Ciona</i> visuomotor behaviors, we used the non-competitive AMPAR antagonist perampanel (<a href="#bib17">Hanada et al., 2011</a>). For the assay, larvae were treated at 25 hr post fertilization (hpf) with perampanel in sea water and compared to vehicle-treated control larvae for both negative phototaxis and response to light dimming. The negative phototaxis assay consisted of placing the larvae in a 10 cm petri dish of sea water with a 505 nm LED lamp placed to one side (as described previously; <a href="#bib46">Salas et al., 2018</a>). Images were collected at 1 min intervals over 5 hr to assess taxis (<a href="#video1">Video 1</a>). <a href="#fig5">Figure 5b and c</a> show representative frames from the time-lapse capture at the start and at 60 min for control and perampanel-treated larvae, respectively. In the control sample the larvae at 60 min were observed to cluster at the side of the petri dish away from the light (distal side; red arrows in <a href="#fig5">Figure 5b</a>). By contrast, no taxis was observed in the perampanel-treated larvae (<a href="#fig5">Figure 5c</a>). Combined results from three independent assays (n = 129–365 larvae per group) are shown in <a href="#fig5">Figure 5d</a> and presented as the percent of larvae found on the distal third of the petri dish. For control larvae, ~70% swam to the distal third within 1 hr, while the perampanel-treated larvae remained evenly distributed across the dish.</p>
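+
+The distal-third metric described above can be computed directly from tracked larval positions. Below is a minimal sketch, assuming per-larva x coordinates in dish coordinates with the 505 nm lamp at x = 0; the function and variable names are illustrative, not the published analysis code.
+
+```python
+import numpy as np
+
+def percent_in_distal_third(x_mm: np.ndarray, dish_diameter_mm: float = 100.0) -> float:
+    """Percentage of larvae in the third of the dish farthest from the 505 nm lamp.
+
+    x_mm holds one x coordinate per larva, with the lamp side at x = 0.
+    """
+    distal_cutoff = dish_diameter_mm * 2.0 / 3.0
+    return 100.0 * float(np.mean(x_mm > distal_cutoff))
+```
+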
+ <div
+ id="video1"
+ class="asset-viewer-inline asset-viewer-inline--video "
+ data-variant="video"
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="video1"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Video 1</span>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9zdGF0aWMtbW92aWUtdXNhLmdsZW5jb2Vzb2Z0d2FyZS5jb20vbXA0LzEwLjc1NTQvODg2Lzc3MWIyN2VkMjZmNzI1MTEwOGJkMzViODQyY2U1OTYzZTYzNDExOTkvZWxpZmUtNDQ3NTMtdmlkZW8xLm1wNA==/elife-44753-video1.mp4?_hash=QotDe4lMfotdXdc%2BUKEblblnp1b0B6bupqA3BcEJbnU%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+
+
+ <video controls="controls" poster="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video1.jpg/full/639,/0/default.jpg" preload="metadata">
+
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video1.jpg/full/639,/0/default.jpg" alt="posterframe for video" />
+
+ <p>This video cannot be played in place because your browser does not support HTML5 video. You may still download the video for offline viewing.</p>
+
+ <source src="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.mp4" type='video/mp4; codecs=&quot;avc1.42E01E, mp4a.40.2&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.mp4">Download as MPEG-4</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.webm" type='video/webm; codecs=&quot;vp8.0, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.webm">Download as WebM</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.ogv" type='video/ogg; codecs=&quot;theora, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video1.ogv">Download as Ogg</a>
+
+ </div>
+
+ </video>
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Negative phototaxis of control and perampanel-treated <i>Ciona</i> larvae in 10 cm petri dishes.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">Directional 505 nm illumination is from the left. Frames were taken at 1 per minute over five hours. In the video the 5 hr is compressed to 15 s (i.e., 1200X normal speed). Black and white tones were inverted to make the larvae more visible.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.014" class="doi__link">https://doi.org/10.7554/eLife.44753.014</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">The inability of the perampanel-treated larvae to undergo phototaxis was not the result of an inability to swim, as seen in <a href="#video2">Video 2</a>, which was taken at 8.9 fps with and without perampanel. Moreover, we observed that perampanel treatment had no effect on the light dimming response (<a href="#video3">Video 3</a>). <a href="#fig6">Figure 6a and b</a> show 5 s projection images from <a href="#video3">Video 3</a> immediately before and after dimming. In these images, swims appear as lines, and the responses in control and perampanel-treated larvae appear qualitatively similar. To quantitatively compare dimming response, control and perampanel-treated larvae were exposed to a range of dimming intensities from 2- to 60-fold, and the percentage of larvae responding was measured and is presented in <a href="#fig6">Figure 6c</a> (results are from three independent assays, with 46–139 larvae per group). The percentage responding at all intensities was very similar for both groups, and pair-wise comparisons at each fold change failed to show significance. In addition, no differences were measured in the velocity or duration of swims in pair-wise comparisons of control and perampanel-treated larvae at any fold-dimming (data not shown). We conclude that there is no change in sensitivity to dimming caused by perampanel treatment, while phototaxis was completely disrupted. Finally, we also observed that the touch response was not inhibited by perampanel (data not shown), despite the presence of VGLUT-positive epidermal sensory neurons (<a href="#bib20">Horie et al., 2008b</a>). This would appear to agree with the observation that the primary RNs for the PNS, the eminens cells and the AMGs, do not express the AMPAR (<a href="#bib18">Hirai et al., 2017</a>; and our observations). In addition to the AMPAR, the <i>Ciona</i> genome contains several other glutamate receptors, including one kainate and one NMDA receptor (<a href="#bib36">Okamura et al., 2005</a>), although their expression has not been characterized.</p>
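+
+The dimming-response scoring described above reduces to a per-group percentage at each fold change. A minimal sketch follows, assuming each group's measurements are stored as a mapping from fold-dimming to a boolean "responded" array; this data layout and the names are assumptions for illustration, not the published analysis code.
+
+```python
+import numpy as np
+
+def percent_responding(responses_by_fold):
+    """Map fold-dimming (e.g. 2, 10, 60) to the percentage of larvae that
+    initiated a swim after dimming (one boolean entry per larva)."""
+    return {fold: 100.0 * float(np.mean(resp)) for fold, resp in responses_by_fold.items()}
+
+# Compared per fold change, e.g. percent_responding(control_data) vs.
+# percent_responding(perampanel_data).
+```
+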
+ <div
+ id="fig6"
+ class="asset-viewer-inline asset-viewer-inline-- "
+ data-variant=""
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="fig6"
+ data-asset-viewer-uri="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1500,/0/default.jpg"
+ data-asset-viewer-width="1500"
+ data-asset-viewer-height="1124"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Figure 6</span>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9paWlmLmVsaWZlc2NpZW5jZXMub3JnL2xheDo0NDc1MyUyRmVsaWZlLTQ0NzUzLWZpZzYtdjIudGlmL2Z1bGwvZnVsbC8wL2RlZmF1bHQuanBn/elife-44753-fig6-v2.jpg?_hash=v4N145cqneQAaynzcjF%2FnAcen6AeM%2BkieeWEMRTMIFY%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1500,/0/default.jpg" class="asset-viewer-inline__open_link" target="_blank" rel="noopener noreferrer"><span class="visuallyhidden">Open asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+ <a href="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1500,/0/default.jpg" class="captioned-asset__link" target="_blank" rel="noopener noreferrer">
+ <picture class="captioned-asset__picture">
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1234,/0/default.webp 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/617,/0/default.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/1234,/0/default.jpg 2x, https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/617,/0/default.jpg 1x"
+ type="image/jpeg"
+ >
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-fig6-v2.tif/full/617,/0/default.jpg"
+
+ alt=""
+ class="captioned-asset__image"
+ >
+ </picture>
+ </a>
+
+
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Perampanel does not disrupt the light dimming response.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">(<b>a</b>) Light dimming response in control larvae. Shown are 5 s (s) projections from time-lapse videos in which swims appear as lines. Left panel shows a projection 5 s before dimming, and right panel 5 s after dimming. (<b>b</b>) same as a, but larvae were perampanel-treated. (<b>c</b>) Quantification of light dimming response in control and perampanel treated larvae. Larvae were exposed to dimming of 505 nm light from 2- to 60-fold. Dimming response was scored as percent of larvae responding. Bars show averages of three independent assays ± standard deviation.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.015" class="doi__link">https://doi.org/10.7554/eLife.44753.015</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+ <div
+ id="video2"
+ class="asset-viewer-inline asset-viewer-inline--video "
+ data-variant="video"
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="video2"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Video 2</span>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9zdGF0aWMtbW92aWUtdXNhLmdsZW5jb2Vzb2Z0d2FyZS5jb20vbXA0LzEwLjc1NTQvODg2Lzc3MWIyN2VkMjZmNzI1MTEwOGJkMzViODQyY2U1OTYzZTYzNDExOTkvZWxpZmUtNDQ3NTMtdmlkZW8yLm1wNA==/elife-44753-video2.mp4?_hash=Td7A8NTjXLYjBj4RDkmj2zYo4EbpSS%2FCisGNZPPo1Ws%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+
+
+ <video controls="controls" poster="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video2.jpg/full/639,/0/default.jpg" preload="metadata">
+
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video2.jpg/full/639,/0/default.jpg" alt="posterframe for video" />
+
+ <p>This video cannot be played in place because your browser does not support HTML5 video. You may still download the video for offline viewing.</p>
+
+ <source src="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.mp4" type='video/mp4; codecs=&quot;avc1.42E01E, mp4a.40.2&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.mp4">Download as MPEG-4</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.webm" type='video/webm; codecs=&quot;vp8.0, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.webm">Download as WebM</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.ogv" type='video/ogg; codecs=&quot;theora, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video2.ogv">Download as Ogg</a>
+
+ </div>
+
+ </video>
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Swimming of control and perampanel-treated <i>Ciona</i> larvae in a directional light field.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">Larvae in 10 cm petri dishes were recorded at nine frames/second. Black and white tones were inverted to make the larvae more visible. The video plays at 5X normal speed.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.017" class="doi__link">https://doi.org/10.7554/eLife.44753.017</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+ <div
+ id="video3"
+ class="asset-viewer-inline asset-viewer-inline--video "
+ data-variant="video"
+ data-behaviour="AssetNavigation AssetViewer ToggleableCaption"
+ data-selector=".caption-text__body"
+ data-asset-viewer-group="video3"
+ >
+
+ <div class="asset-viewer-inline__header_panel">
+ <div class="asset-viewer-inline__header_text">
+ <span class="asset-viewer-inline__header_text__prominent">Video 3</span>
+ </div>
+
+
+ <div class="asset-viewer-inline__figure_access">
+ <a href="https://elifesciences.org/download/aHR0cHM6Ly9zdGF0aWMtbW92aWUtdXNhLmdsZW5jb2Vzb2Z0d2FyZS5jb20vbXA0LzEwLjc1NTQvODg2Lzc3MWIyN2VkMjZmNzI1MTEwOGJkMzViODQyY2U1OTYzZTYzNDExOTkvZWxpZmUtNDQ3NTMtdmlkZW8zLm1wNA==/elife-44753-video3.mp4?_hash=GCjCBk4K%2BjkExSoFO0Wc8pv%2FhPFCkRZ84QpH%2Bavz994%3D" class="asset-viewer-inline__download_all_link" download="Download"><span class="visuallyhidden">Download asset</span></a>
+ </div>
+
+ </div>
+
+ <figure class="captioned-asset">
+
+
+
+ <video controls="controls" poster="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video3.jpg/full/639,/0/default.jpg" preload="metadata">
+
+ <img src="https://iiif.elifesciences.org/lax:44753%2Felife-44753-video3.jpg/full/639,/0/default.jpg" alt="posterframe for video" />
+
+ <p>This video cannot be played in place because your browser does not support HTML5 video. You may still download the video for offline viewing.</p>
+
+ <source src="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.mp4" type='video/mp4; codecs=&quot;avc1.42E01E, mp4a.40.2&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/mp4/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.mp4">Download as MPEG-4</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.webm" type='video/webm; codecs=&quot;vp8.0, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/webm/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.webm">Download as WebM</a>
+
+ </div>
+ <source src="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.ogv" type='video/ogg; codecs=&quot;theora, vorbis&quot;' class="media-source"/>
+
+ <div>
+
+
+ <a class="media-source__fallback_link" href="https://static-movie-usa.glencoesoftware.com/ogv/10.7554/886/771b27ed26f7251108bd35b842ce5963e6341199/elife-44753-video3.ogv">Download as Ogg</a>
+
+ </div>
+
+ </video>
+
+
+ <figcaption class="captioned-asset__caption">
+
+ <h6 class="caption-text__heading">Dimming response of control and perampanel-treated <i>Ciona</i> larvae in 10 cm petri dishes.</h6>
+
+
+ <div class="caption-text__body"><p class="paragraph">Larvae were imaged for 70 s at five frames/second, with dimming of 505 nm ambient light at 10 s. Black and white tones were inverted, and thus the dimming appears as a brightening. The video plays at 5X normal speed.</p>
+</div>
+
+ <span class="doi doi--asset"><a href="https://doi.org/10.7554/eLife.44753.018" class="doi__link">https://doi.org/10.7554/eLife.44753.018</a></span>
+
+ </figcaption>
+
+
+
+
+ </figure>
+
+
+ </div>
+<p class="paragraph">In summary, we are able to separate the phototaxis and dimming behaviors pharmacologically. Moreover, we can identify the VACHT/AMPAR-positive RNs as essential for an excitatory PR-I circuit that involves presynaptic glutamatergic PR-Is and postsynaptic cholinergic MGINs. The number and location of the VACHT/AMPAR-positive RNs, the circuit logic, and our behavioral observations are all consistent with these being prRNs.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s2-5"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">A disinhibitory circuit</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Of equal significance to our observation that navigation is inhibited by perampanel is our observation that the dimming response, which is mediated by the PR-IIs (<a href="#bib46">Salas et al., 2018</a>), is not inhibited by perampanel (<a href="#fig6">Figure 6</a>). Our expression studies show that the PR-IIs are composed of a mixture of VGAT- and VGAT/VGLUT-expressing photoreceptors. Although it is formally possible that PR-IIs signal exclusively via glutamate in an excitatory circuit via a non-AMPA glutamate receptor on their RNs, our observations that several of the PR-IIs are VGAT-only, as are the majority of the pr-AMG RNs, suggest an alternative disinhibitory circuit logic. This circuit would consist of the inhibitory PR-IIs synapsing to the pr-AMG RNs to reduce their inhibition on the cholinergic MGINs.</p>
+<p class="paragraph">Implicit in the disinhibitory model is an autonomous level of motor activity in larvae that could be inhibited by the GABAergic pr-AMG RNs, and that this inhibition is released upon stimulation of the GABAergic PR-IIs. We investigated this possibility by two approaches. In the first approach, we inhibited GABAergic receptors with picrotoxin (<a href="#bib37">Olsen, 2014</a>), which should inhibit signals from the GABAergic photoreceptors and the pr-AMG RNs (and most likely the AntRNs), as well as PNS relay neurons, including the eminens cells and the AMGs. The ACINs, which are essential for the central pattern generator (<a href="#bib35">Nishino et al., 2010</a>), are glycinergic and should not be inhibited by picrotoxin. In the second approach, we took advantage of a previously described <i>Ciona</i> mutant, <i>frimousse (frm)</i> (<a href="#bib10">Deschet and Smith, 2004</a>; <a href="#bib16">Hackley et al., 2013</a>). In homozygous <i>frm</i> larvae the anterior BV is transfated to epidermis due to a null mutation in a neurula stage-specific connexin gene (<a href="#bib16">Hackley et al., 2013</a>). <i>Frm</i> larvae thus lack the ocellus pigment cell and photoreceptors, as well as the otolith, although the motor ganglion appears intact (<a href="#bib10">Deschet and Smith, 2004</a>; <a href="#bib16">Hackley et al., 2013</a>).</p>
+<section
+ class="article-section "
+ id="s2-5-1"
+>
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-4"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Hybridization chain reaction (HCR) in situ</h3>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-4" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph"><i>Ciona intestinalis</i>-type B were used for in situ studies and staged to match the animals used in the connectome study (<a href="#bib43">Ryan et al., 2016</a>). Optimized HCR in situ probes for each target transcript were obtained from Molecular Technologies. For detection of GABAergic/glycinergic cells, probes were made to the vesicular GABA transporter gene; for glutamatergic cells, probes were made to the vesicular glutamate transporter; for cholinergic cells, probes were made to the vesicular acetylcholine transporter. The sequences from which the HCR probe sets were chosen were assembled from scaffold reads available through the Aniseed website (aniseed.cnrs.fr), and are shown in <a href="/articles/44753/figures#supp1">Supplementary file 1</a>. The in situ protocol followed the previously published <i>Ciona in situ</i> hybridization protocol (<a href="#bib8">Corbo et al., 1997</a>) until the prehybridization step. From that point, the protocol followed the published HCR protocol (<a href="#bib7">Choi et al., 2018</a>), with the following exception: during the amplification stage, incubation with hairpins is performed for 3 days instead of 12–16 hr.</p>
+<p class="paragraph">HCR in situ stained larvae were cleared with Slowfade Gold with DAPI (Invitrogen) and imaged on a Leica SP8 resonant scanning confocal microscope. Imaris v. 9.1 (Bitplane) was used to visualize embryos and assign centroids to nuclei using the ‘add new spots’ function, followed by manual correction when necessary. Nuclei were assigned using the maximum intensity projection, cropped to the area of interest. Volume rendering of in situ patterns was also done using Imaris v. 9.1.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-5"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Cell registration</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">A rotation matrix was calculated based on the 3-dimensional vectors between the anchor cells (ddN and/or antenna cells) and the center of the target cells (photoreceptors or relay neurons) using the HCR in situ (target set) and connectome cell centroids (source set). The source set was then rotated into approximate alignment with the target set. Next, the Coherent Point Drift algorithm was used to calculate an affine transformation matrix between the source set and the target set of cells (<a href="#bib32">Myronenko and Song, 2010</a>). This algorithm models the source set as a Gaussian Mixture Model (GMM), and the target set is treated as observations from the GMM. The transformation matrix is calculated to maximize the maximum a posteriori (MAP) probability that the observed point cloud is drawn from the GMM. A nearest neighbor mapping based on Euclidean distance is then used to find the closest corresponding point in the target cell set for each cell in the transformed source cell set. The implementation used was adapted from the pure Python implementation at <a href="https://github.com/siavashk/pycpd">https://github.com/siavashk/pycpd</a>. The maximum number of iterations was set to 1000, and the maximum root mean squared error for convergence was set to 0.001. The code for the registration is available as supplementary material (<a href="/articles/44753/figures#scode1">Source codes 1</a>–<a href="/articles/44753/figures#scode3">3</a>).</p>
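+
+A minimal sketch of the registration step described above, assuming the pycpd package (https://github.com/siavashk/pycpd) and scipy are available; the anchor-cell pre-rotation is omitted and all names are illustrative. The published Source codes 1–3 remain the authoritative implementation.
+
+```python
+import numpy as np
+from scipy.spatial import cKDTree
+from pycpd import AffineRegistration
+
+def register_cells(source_xyz: np.ndarray, target_xyz: np.ndarray) -> np.ndarray:
+    """Map connectome centroids (source) onto in situ centroids (target).
+
+    Both inputs are (n_cells, 3) arrays of nuclear centroids. Returns, for each
+    source cell, the index of the nearest target cell after the affine transform.
+    """
+    # Coherent Point Drift: the source set is modelled as GMM centroids and the
+    # target set is treated as observations drawn from that mixture.
+    reg = AffineRegistration(X=target_xyz, Y=source_xyz,
+                             max_iterations=1000, tolerance=0.001)
+    transformed_source, _params = reg.register()
+
+    # Nearest-neighbour assignment by Euclidean distance.
+    _dist, nearest_target = cKDTree(target_xyz).query(transformed_source)
+    return nearest_target
+```
+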
+<section
+ class="article-section "
+ id="s4-5-1"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Confusion matrix</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-5-1" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+          <p class="paragraph">Each dataset containing NT information was registered to every other dataset of the same type using the algorithm detailed above. The EM-registration-based cell assignments of each cell in both sets are then compared to see if they agree (<a href="#bib49">Stehman, 1997</a>). The confusion matrix shows the number of times a cell assignment in one dataset corresponds with each cell assignment in another dataset.</p>
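+<p class="paragraph">As an illustrative sketch (not the authors’ code), such a comparison can be tabulated with <code>pandas.crosstab</code>, assuming pairs of connectome cell assignments for cells matched between two registered datasets:</p>
+<pre><code>import pandas as pd
+
+def assignment_confusion(matched_pairs):
+    # matched_pairs: iterable of (assignment in dataset A, assignment in dataset B)
+    # for cells matched between the two registered datasets.
+    a, b = zip(*matched_pairs)
+    return pd.crosstab(pd.Series(a, name="dataset A"),
+                       pd.Series(b, name="dataset B"))
+</code></pre>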
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-6"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Behavioral assays</h3>
+ </header>
+
+ <div class="article-section__body">
+          <p class="paragraph">For time-lapse videos, the inverted lid of a 60 mm petri dish was first coated with a thin layer of 1% agarose. Larvae were then added to the inverted lid with filtered sea water containing 0.1% BSA with streptomycin and kanamycin each at 20 μg/ml. Finally, the dish was covered with a square of glass leaving no air at the top interface. Stock solutions of perampanel were dissolved in methanol and diluted to final concentrations of either 5 µM (Santa Cruz Biotech) or 15 µM (Adooq Bioscience) in filtered sea water/BSA/antibiotics. Picrotoxin (Tocris) was also diluted in methanol and used at a final concentration of 1 mM. Control samples received methanol alone.</p>
+<p class="paragraph">Time-lapse images were collected using a Hamamatsu Orca-ER camera fitted on a Navitar 7000 macro zoom lens. Programmable 700 nm and 505 nm LED lamps were used to illuminate the larvae (Mightex). All light intensity readings were taken with an Extech Instruments light meter.</p>
+<section
+ class="article-section "
+ id="s4-6-1"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Dimming-response</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-6-1" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">All larvae used were between 25 and 28 hpf (18°C). For image capture, the larvae were illuminated with the 700 nm LED lamp and the camera was fitted with a red filter to block the 505 nm light. The videos were recorded at five fps. In the assays, larvae were first recorded for 10 s with the 505 nm LED light mounted above the dish at 600 lux and then dimmed to specific values while image capture continued for another 3 min. Larvae were allowed to recover for 5 min before being assayed again.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-6-2"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Phototaxis</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-6-2" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+          <p class="paragraph">All larvae used were approximately 25 hpf (18°C). The 505 nm LED light was mounted to one side of the petri dish at approximately 3000 lux. Images were captured at one frame per minute for five hours, with the exception of a 30 s capture session at 8.9 fps to assay swimming behavior.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-6-3"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Spontaneous Swims</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-6-3" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">All larvae used were between 26 and 28 hpf. The plates were illuminated with only a 700 nm LED light in order to record dark conditions. The videos were recorded at about 8.9 fps for one minute.</p>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-7"
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Behavioral data analysis</h3>
+ </header>
+
+ <div class="article-section__body">
+ <section
+ class="article-section "
+ id="s4-7-1"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Dim-response criteria</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-7-1" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">Responses to light dimming were counted if: (1) the larva was stationary at the time of the light dimming, and (2) it swam for longer than 3 s. Three seconds was determined by measuring the duration of tail flicks as previously described (<a href="#bib46">Salas et al., 2018</a>). Larvae that bumped or brushed against other larvae or the dish edges were not counted.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-7-2"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Tracking and quantification</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-7-2" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+          <p class="paragraph">Larval swims were tracked using a custom MATLAB script named Estimators of Locomotion Iterations for Animal Experiments (ELIANE). Before uploading to ELIANE, time-lapse images were first processed with Fiji (ImageJ) by subtracting a minimum Z-projection from all the frames and then inverting black and white. ELIANE takes the processed time-lapse images and first creates a background image by averaging the pixels from all the frames. Next, it goes to the initial frame, subtracts the background image, and stores all remaining objects found in the specified region of interest (ROI) as initial objects. Then, for each initial object in turn, it steps frame by frame, subtracting the background image and determining the object’s new position by comparing its Euclidean distance to all objects detected in that frame. If the object had moved unrealistically fast (&gt;6.5 mm/s), moved outside the ROI, or did not move after a set time (1 min), the object was not analyzed. This MATLAB script can be found in the Supplemental Materials (<a href="/articles/44753/figures#scode4">Source code 4</a>).</p>
+<p class="paragraph">The spontaneous swims in the <i>frimousse</i> experiment were quantified manually.</p>
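+<p class="paragraph">A rough Python sketch of the tracking loop described above (the published script itself is MATLAB); the frame rate, pixel calibration and detection threshold are placeholder assumptions, and the ROI-exit and stationarity checks are omitted:</p>
+<pre><code>import numpy as np
+from scipy import ndimage
+
+FPS = 8.9             # assumed frame rate
+MM_PER_PIXEL = 0.01   # assumed pixel calibration
+MAX_SPEED = 6.5       # mm/s; faster apparent steps are treated as errors
+
+def centroids(frame, background, threshold=10):
+    # Background-subtract one frame and return centroids of remaining objects.
+    labels, n = ndimage.label((frame - background) &gt; threshold)
+    return np.array(ndimage.center_of_mass(frame, labels, range(1, n + 1)))
+
+def track(frames, roi_mask):
+    # frames: (T, H, W) array; roi_mask: (H, W) boolean region of interest.
+    frames = frames * roi_mask            # keep only the region of interest
+    background = frames.mean(axis=0)      # average of all frames
+    tracks = [[c] for c in centroids(frames[0], background)]
+    for frame in frames[1:]:
+        points = centroids(frame, background)
+        if len(points) == 0:
+            continue
+        for path in tracks:
+            d = np.linalg.norm(points - path[-1], axis=1)
+            if d.min() * MM_PER_PIXEL * FPS &gt; MAX_SPEED:
+                continue  # unrealistically fast; leave this track unextended
+            path.append(points[int(d.argmin())])
+    return tracks
+</code></pre>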
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-7-3"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Sampling</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-7-3" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+          <p class="paragraph">Assessment of larval swim parameters was performed using three independent assays. For the spontaneous swims, which were quantified manually, 25 larvae were selected randomly, starting from the center of the plate and going outward, using only those that could be tracked for the entire one-minute recording session.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+ id="s4-7-4"
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">Tests of significance</h4>
+ <a href="https://bio-protocol.org/eLIFErap44753?item=s4-7-4" class="article-section__header_link">Request a detailed protocol</a>
+ </header>
+
+ <div class="article-section__body">
+          <p class="paragraph">The significance of the dimming response and of swim frequency was calculated using the Wilcoxon rank-sum test; the significance of spontaneous swim time was calculated using Student’s <i>t</i>-test; and the significance of the variance of spontaneous swimming was calculated using the F-test.</p>
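+<p class="paragraph">For reference, a minimal sketch of these tests using <code>scipy.stats</code>, with placeholder group arrays (an illustration, not the authors’ analysis code):</p>
+<pre><code>import numpy as np
+from scipy import stats
+
+def compare_groups(control, treated):
+    # Wilcoxon rank-sum test (dimming response, swim frequency).
+    _, ranksum_p = stats.ranksums(control, treated)
+    # Student's t-test (spontaneous swim time).
+    _, ttest_p = stats.ttest_ind(control, treated)
+    # Two-sided F-test for equality of variances (spontaneous swimming).
+    f = np.var(treated, ddof=1) / np.var(control, ddof=1)
+    dfn, dfd = len(treated) - 1, len(control) - 1
+    ftest_p = 2 * min(stats.f.sf(f, dfn, dfd), stats.f.cdf(f, dfn, dfd))
+    return {"ranksum_p": ranksum_p, "ttest_p": ttest_p, "ftest_p": ftest_p}
+</code></pre>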
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+
+
+ <section
+ class="article-section "
+ id="references"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">References</h2>
+ </header>
+
+ <div class="article-section__body">
+
+<ol class="reference-list">
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">1</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib1" id="bib1">
+
+
+ <a href="https://doi.org/10.1016/j.coisb.2017.12.005" class="reference__title">Fold-change detection in biological systems</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Adler%22" class="reference__authors_link">M Adler</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:U+Alon%22" class="reference__authors_link">U Alon</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2018)</span>
+
+ <div class="reference__origin"><i>Current Opinion in Systems Biology</i> <b>8</b>:81–89.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1016/j.coisb.2017.12.005" class="doi__link">https://doi.org/10.1016/j.coisb.2017.12.005</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Fold-change+detection+in+biological+systems&amp;author=M+Adler&amp;author=U+Alon&amp;publication_year=2018&amp;journal=Current+Opinion+in+Systems+Biology&amp;volume=8&amp;pages=pp.+81%E2%80%9389" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">2</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib2" id="bib2">
+
+
+ <a href="https://doi.org/10.1038/nrg2102" class="reference__title">Network motifs: <i>theory and experimental approaches</i></a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:U+Alon%22" class="reference__authors_link">U Alon</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2007)</span>
+
+ <div class="reference__origin"><i>Nature Reviews Genetics</i> <b>8</b>:450–461.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1038/nrg2102" class="doi__link">https://doi.org/10.1038/nrg2102</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/17510665" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Network+motifs%3A+theory+and+experimental+approaches&amp;author=U+Alon&amp;publication_year=2007&amp;journal=Nature+Reviews+Genetics&amp;volume=8&amp;pages=pp.+450%E2%80%93461&amp;pmid=17510665" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">3</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib3" id="bib3">
+
+
+
+
+ <div class="reference__title">Evolution of eyes and photoreceptor cell types</div>
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:D+Arendt%22" class="reference__authors_link">D Arendt</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2003)</span>
+
+ <div class="reference__origin"><i>The International Journal of Developmental Biology</i> <b>47</b>:563–571.</div>
+
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/14756332" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Evolution+of+eyes+and+photoreceptor+cell+types&amp;author=D+Arendt&amp;publication_year=2003&amp;journal=The+International+Journal+of+Developmental+Biology&amp;volume=47&amp;pages=pp.+563%E2%80%93571&amp;pmid=14756332" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">4</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib4" id="bib4">
+
+
+ <a href="https://doi.org/10.1038/nn1136" class="reference__title">EXP-1 is an excitatory GABA-gated cation channel</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:AA+Beg%22" class="reference__authors_link">AA Beg</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:EM+Jorgensen%22" class="reference__authors_link">EM Jorgensen</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2003)</span>
+
+ <div class="reference__origin"><i>Nature Neuroscience</i> <b>6</b>:1145–1152.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1038/nn1136" class="doi__link">https://doi.org/10.1038/nn1136</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/14555952" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=EXP-1+is+an+excitatory+GABA-gated+cation+channel&amp;author=AA+Beg&amp;author=EM+Jorgensen&amp;publication_year=2003&amp;journal=Nature+Neuroscience&amp;volume=6&amp;pages=pp.+1145%E2%80%931152&amp;pmid=14555952" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">5</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib5" id="bib5">
+
+
+ <a href="https://doi.org/10.1111/j.1460-9568.2005.04420.x" class="reference__title">GABAergic synaptic transmission modulates swimming in the ascidian larva</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:ER+Brown%22" class="reference__authors_link">ER Brown</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:A+Nishino%22" class="reference__authors_link">A Nishino</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:Q+Bone%22" class="reference__authors_link">Q Bone</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:IA+Meinertzhagen%22" class="reference__authors_link">IA Meinertzhagen</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:Y+Okamura%22" class="reference__authors_link">Y Okamura</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2005)</span>
+
+ <div class="reference__origin"><i>European Journal of Neuroscience</i> <b>22</b>:2541–2548.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1111/j.1460-9568.2005.04420.x" class="doi__link">https://doi.org/10.1111/j.1460-9568.2005.04420.x</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/16307596" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=GABAergic+synaptic+transmission+modulates+swimming+in+the+ascidian+larva&amp;author=ER+Brown&amp;author=A+Nishino&amp;author%5B2%5D=Q+Bone&amp;author%5B3%5D=IA+Meinertzhagen&amp;author%5B4%5D=Y+Okamura&amp;publication_year=2005&amp;journal=European+Journal+of+Neuroscience&amp;volume=22&amp;pages=pp.+2541%E2%80%932548&amp;pmid=16307596" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">49</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib49" id="bib49">
+
+
+ <a href="https://doi.org/10.1016/S0034-4257(97)00083-7" class="reference__title">Selecting and interpreting measures of thematic classification accuracy</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:SV+Stehman%22" class="reference__authors_link">SV Stehman</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(1997)</span>
+
+ <div class="reference__origin"><i>Remote Sensing of Environment</i> <b>62</b>:77–89.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1016/S0034-4257(97)00083-7" class="doi__link">https://doi.org/10.1016/S0034-4257(97)00083-7</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Selecting+and+interpreting+measures+of+thematic+classification+accuracy&amp;author=SV+Stehman&amp;publication_year=1997&amp;journal=Remote+Sensing+of+Environment&amp;volume=62&amp;pages=pp.+77%E2%80%9389" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">50</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib50" id="bib50">
+
+
+ <a href="https://doi.org/10.2307/1542300" class="reference__title">Ciliary Hovering in Larval Lancelets (=Amphioxus)</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:MD+Stokes%22" class="reference__authors_link">MD Stokes</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:ND+Holland%22" class="reference__authors_link">ND Holland</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(1995)</span>
+
+ <div class="reference__origin"><i>The Biological Bulletin</i> <b>188</b>:231–233.</div>
+
+ <span class="doi"><a href="https://doi.org/10.2307/1542300" class="doi__link">https://doi.org/10.2307/1542300</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/29281329" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Ciliary+Hovering+in+Larval+Lancelets+%28%3DAmphioxus%29&amp;author=MD+Stokes&amp;author=ND+Holland&amp;publication_year=1995&amp;journal=The+Biological+Bulletin&amp;volume=188&amp;pages=pp.+231%E2%80%93233&amp;pmid=29281329" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">51</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib51" id="bib51">
+
+
+ <a href="https://doi.org/10.1002/cne.23679" class="reference__title">A comparative examination of neural circuit and brain patterning between the lamprey and amphioxus reveals the evolutionary origin of the vertebrate visual center</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:DG+Suzuki%22" class="reference__authors_link">DG Suzuki</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:Y+Murakami%22" class="reference__authors_link">Y Murakami</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:H+Escriva%22" class="reference__authors_link">H Escriva</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:H+Wada%22" class="reference__authors_link">H Wada</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2015)</span>
+
+ <div class="reference__origin"><i>Journal of Comparative Neurology</i> <b>523</b>:251–261.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1002/cne.23679" class="doi__link">https://doi.org/10.1002/cne.23679</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/25233869" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=A+comparative+examination+of+neural+circuit+and+brain+patterning+between+the+lamprey+and+amphioxus+reveals+the+evolutionary+origin+of+the+vertebrate+visual+center&amp;author=DG+Suzuki&amp;author=Y+Murakami&amp;author%5B2%5D=H+Escriva&amp;author%5B3%5D=H+Wada&amp;publication_year=2015&amp;journal=Journal+of+Comparative+Neurology&amp;volume=523&amp;pages=pp.+251%E2%80%93261&amp;pmid=25233869" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">52</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib52" id="bib52">
+
+
+ <a href="https://doi.org/10.1007/s00427-001-0205-0" class="reference__title">Developmental expression of ascidian neurotransmitter synthesis genes. I. choline acetyltransferase and acetylcholine transporter genes</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:K+Takamura%22" class="reference__authors_link">K Takamura</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Egawa%22" class="reference__authors_link">T Egawa</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Ohnishi%22" class="reference__authors_link">S Ohnishi</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Okada%22" class="reference__authors_link">T Okada</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Fukuoka%22" class="reference__authors_link">T Fukuoka</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2002)</span>
+
+ <div class="reference__origin"><i>Development Genes and Evolution</i> <b>212</b>:50–53.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1007/s00427-001-0205-0" class="doi__link">https://doi.org/10.1007/s00427-001-0205-0</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/11875658" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Developmental+expression+of+ascidian+neurotransmitter+synthesis+genes.+I.+choline+acetyltransferase+and+acetylcholine+transporter+genes&amp;author=K+Takamura&amp;author=T+Egawa&amp;author%5B2%5D=S+Ohnishi&amp;author%5B3%5D=T+Okada&amp;author%5B4%5D=T+Fukuoka&amp;publication_year=2002&amp;journal=Development+Genes+and+Evolution&amp;volume=212&amp;pages=pp.+50%E2%80%9353&amp;pmid=11875658" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">53</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib53" id="bib53">
+
+
+ <a href="https://doi.org/10.2108/zsj.27.191" class="reference__title">Neural map of the larval central nervous system in the ascidian Ciona intestinalis</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:K+Takamura%22" class="reference__authors_link">K Takamura</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:N+Minamida%22" class="reference__authors_link">N Minamida</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Okabe%22" class="reference__authors_link">S Okabe</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2010)</span>
+
+ <div class="reference__origin"><i>Zoological Science</i> <b>27</b>:191–203.</div>
+
+ <span class="doi"><a href="https://doi.org/10.2108/zsj.27.191" class="doi__link">https://doi.org/10.2108/zsj.27.191</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/20141424" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Neural+map+of+the+larval+central+nervous+system+in+the+ascidian+Ciona+intestinalis&amp;author=K+Takamura&amp;author=N+Minamida&amp;author%5B2%5D=S+Okabe&amp;publication_year=2010&amp;journal=Zoological+Science&amp;volume=27&amp;pages=pp.+191%E2%80%93203&amp;pmid=20141424" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">54</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib54" id="bib54">
+
+
+ <a href="https://doi.org/10.1007/978-1-61779-210-6_15" class="reference__title">Ciona genetics</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:MT+Veeman%22" class="reference__authors_link">MT Veeman</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Chiba%22" class="reference__authors_link">S Chiba</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:WC+Smith%22" class="reference__authors_link">WC Smith</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2011)</span>
+
+ <div class="reference__origin"><i>Methods in Molecular Biology</i> <b>770</b>:401–422.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1007/978-1-61779-210-6_15" class="doi__link">https://doi.org/10.1007/978-1-61779-210-6_15</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/21805273" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Ciona+genetics&amp;author=MT+Veeman&amp;author=S+Chiba&amp;author%5B2%5D=WC+Smith&amp;publication_year=2011&amp;journal=Methods+in+Molecular+Biology&amp;volume=770&amp;pages=pp.+401%E2%80%93422&amp;pmid=21805273" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">55</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib55" id="bib55">
+
+
+ <a href="https://doi.org/10.1073/pnas.1207580109" class="reference__title">Molecular analysis of the amphioxus frontal eye unravels the evolutionary origin of the retina and pigment cells of the vertebrate eye</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:P+Vopalensky%22" class="reference__authors_link">P Vopalensky</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:J+Pergner%22" class="reference__authors_link">J Pergner</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Liegertova%22" class="reference__authors_link">M Liegertova</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:E+Benito-Gutierrez%22" class="reference__authors_link">E Benito-Gutierrez</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:D+Arendt%22" class="reference__authors_link">D Arendt</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:Z+Kozmik%22" class="reference__authors_link">Z Kozmik</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2012)</span>
+
+ <div class="reference__origin"><i>PNAS</i> <b>109</b>:15383–15388.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1073/pnas.1207580109" class="doi__link">https://doi.org/10.1073/pnas.1207580109</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/22949670" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Molecular+analysis+of+the+amphioxus+frontal+eye+unravels+the+evolutionary+origin+of+the+retina+and+pigment+cells+of+the+vertebrate+eye&amp;author=P+Vopalensky&amp;author=J+Pergner&amp;author%5B2%5D=M+Liegertova&amp;author%5B3%5D=E+Benito-Gutierrez&amp;author%5B4%5D=D+Arendt&amp;author%5B5%5D=Z+Kozmik&amp;publication_year=2012&amp;journal=PNAS&amp;volume=109&amp;pages=pp.+15383%E2%80%9315388&amp;pmid=22949670" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">56</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib56" id="bib56">
+
+
+ <a href="https://doi.org/10.1002/gene.20032" class="reference__title">Identification of neuron-specific promoters in Ciona intestinalis</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:R+Yoshida%22" class="reference__authors_link">R Yoshida</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:D+Sakurai%22" class="reference__authors_link">D Sakurai</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Horie%22" class="reference__authors_link">T Horie</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:I+Kawakami%22" class="reference__authors_link">I Kawakami</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Tsuda%22" class="reference__authors_link">M Tsuda</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:T+Kusakabe%22" class="reference__authors_link">T Kusakabe</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2004)</span>
+
+ <div class="reference__origin"><i>Genesis</i> <b>39</b>:130–140.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1002/gene.20032" class="doi__link">https://doi.org/10.1002/gene.20032</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/15170699" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Identification+of+neuron-specific+promoters+in+Ciona+intestinalis&amp;author=R+Yoshida&amp;author=D+Sakurai&amp;author%5B2%5D=T+Horie&amp;author%5B3%5D=I+Kawakami&amp;author%5B4%5D=M+Tsuda&amp;author%5B5%5D=T+Kusakabe&amp;publication_year=2004&amp;journal=Genesis&amp;volume=39&amp;pages=pp.+130%E2%80%93140&amp;pmid=15170699" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">57</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib57" id="bib57">
+
+
+ <a href="https://doi.org/10.1242/jeb.012864" class="reference__title">Shadow response in the blind cavefish Astyanax reveals conservation of a functional pineal eye</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Yoshizawa%22" class="reference__authors_link">M Yoshizawa</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:WR+Jeffery%22" class="reference__authors_link">WR Jeffery</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2008)</span>
+
+ <div class="reference__origin"><i>Journal of Experimental Biology</i> <b>211</b>:292–299.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1242/jeb.012864" class="doi__link">https://doi.org/10.1242/jeb.012864</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/18203983" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Shadow+response+in+the+blind+cavefish+Astyanax+reveals+conservation+of+a+functional+pineal+eye&amp;author=M+Yoshizawa&amp;author=WR+Jeffery&amp;publication_year=2008&amp;journal=Journal+of+Experimental+Biology&amp;volume=211&amp;pages=pp.+292%E2%80%93299&amp;pmid=18203983" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">58</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib58" id="bib58">
+
+
+ <a href="https://doi.org/10.1523/JNEUROSCI.0141-10.2010" class="reference__title">Synaptic and vesicular coexistence of VGLUT and VGAT in selected excitatory and inhibitory synapses</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:JF+Zander%22" class="reference__authors_link">JF Zander</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:A+M%C3%BCnster-Wandowski%22" class="reference__authors_link">A Münster-Wandowski</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:I+Brunk%22" class="reference__authors_link">I Brunk</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:I+Pahner%22" class="reference__authors_link">I Pahner</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:G+G%C3%B3mez-Lira%22" class="reference__authors_link">G Gómez-Lira</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:U+Heinemann%22" class="reference__authors_link">U Heinemann</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:R+Guti%C3%A9rrez%22" class="reference__authors_link">R Gutiérrez</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:G+Laube%22" class="reference__authors_link">G Laube</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:G+Ahnert-Hilger%22" class="reference__authors_link">G Ahnert-Hilger</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2010)</span>
+
+ <div class="reference__origin"><i>Journal of Neuroscience</i> <b>30</b>:7634–7645.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1523/JNEUROSCI.0141-10.2010" class="doi__link">https://doi.org/10.1523/JNEUROSCI.0141-10.2010</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/20519538" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Synaptic+and+vesicular+coexistence+of+VGLUT+and+VGAT+in+selected+excitatory+and+inhibitory+synapses&amp;author=JF+Zander&amp;author=A+M%C3%BCnster-Wandowski&amp;author%5B2%5D=I+Brunk&amp;author%5B3%5D=I+Pahner&amp;author%5B4%5D=G+G%C3%B3mez-Lira&amp;author%5B5%5D=U+Heinemann&amp;author%5B6%5D=R+Guti%C3%A9rrez&amp;author%5B7%5D=G+Laube&amp;author%5B8%5D=G+Ahnert-Hilger&amp;publication_year=2010&amp;journal=Journal+of+Neuroscience&amp;volume=30&amp;pages=pp.+7634%E2%80%937645&amp;pmid=20519538" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">59</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib59" id="bib59">
+
+
+ <a href="https://doi.org/10.1002/cne.21565" class="reference__title">Developmental expression of glutamic acid decarboxylase and of gamma-aminobutyric acid type B receptors in the ascidian Ciona intestinalis</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:G+Zega%22" class="reference__authors_link">G Zega</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Biggiogero%22" class="reference__authors_link">M Biggiogero</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Groppelli%22" class="reference__authors_link">S Groppelli</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:S+Candiani%22" class="reference__authors_link">S Candiani</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:D+Oliveri%22" class="reference__authors_link">D Oliveri</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Parodi%22" class="reference__authors_link">M Parodi</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:M+Pestarino%22" class="reference__authors_link">M Pestarino</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:F+De+Bernardi%22" class="reference__authors_link">F De Bernardi</a></li>
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:R+Pennati%22" class="reference__authors_link">R Pennati</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2008)</span>
+
+ <div class="reference__origin"><i>The Journal of Comparative Neurology</i> <b>506</b>:489–505.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1002/cne.21565" class="doi__link">https://doi.org/10.1002/cne.21565</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/18041772" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Developmental+expression+of+glutamic+acid+decarboxylase+and+of+gamma-aminobutyric+acid+type+B+receptors+in+the+ascidian+Ciona+intestinalis&amp;author=G+Zega&amp;author=M+Biggiogero&amp;author%5B2%5D=S+Groppelli&amp;author%5B3%5D=S+Candiani&amp;author%5B4%5D=D+Oliveri&amp;author%5B5%5D=M+Parodi&amp;author%5B6%5D=M+Pestarino&amp;author%5B7%5D=F+De+Bernardi&amp;author%5B8%5D=R+Pennati&amp;publication_year=2008&amp;journal=The+Journal+of+Comparative+Neurology&amp;volume=506&amp;pages=pp.+489%E2%80%93505&amp;pmid=18041772" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+ <li class="reference-list__item">
+ <span class="reference-list__ordinal_number">60</span>
+ <div class="reference" data-popup-label="See in references" data-popup-contents="bib60" id="bib60">
+
+
+ <a href="https://doi.org/10.1007/978-981-10-7545-2_5" class="reference__title">Electroporation in ascidians: history, theory and protocols</a>
+
+
+
+
+ <ol class="reference__authors_list">
+ <li class="reference__author">
+ <a href="https://scholar.google.com/scholar?q=%22author:RW+Zeller%22" class="reference__authors_link">RW Zeller</a></li>
+ </ol>
+ <span class="reference__authors_list_suffix">(2018)</span>
+
+ <div class="reference__origin"><i>Advances in Experimental Medicine and Biology</i> <b>1029</b>:37–48.</div>
+
+ <span class="doi"><a href="https://doi.org/10.1007/978-981-10-7545-2_5" class="doi__link">https://doi.org/10.1007/978-981-10-7545-2_5</a></span>
+
+ <ul class="reference__abstracts">
+ <li class="reference__abstract"><a href="https://www.ncbi.nlm.nih.gov/pubmed/29542079" class="reference__abstract_link">PubMed</a></li>
+ <li class="reference__abstract"><a href="https://scholar.google.com/scholar_lookup?title=Electroporation+in+ascidians%3A+history%2C+theory+and+protocols&amp;author=RW+Zeller&amp;publication_year=2018&amp;journal=Advances+in+Experimental+Medicine+and+Biology&amp;volume=1029&amp;pages=pp.+37%E2%80%9348&amp;pmid=29542079" class="reference__abstract_link">Google Scholar</a></li>
+
+ </ul>
+ </div>
+ </li>
+</ol>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="SA1"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Decision letter</h2>
+ </header>
+
+ <div class="article-section__body">
+ <div class="decision-letter-header">
+ <ol class="listing-list">
+ <li class="listing-list__item">
+ <div class="profile-snippet">
+ <div class="profile-snippet__container clearfix">
+
+ <div class="profile-snippet__name">Oliver Hobert</div>
+ <div class="profile-snippet__title">Reviewing Editor; Howard Hughes Medical Institute, Columbia University, United States</div>
+ </div>
+ </div>
+ </li>
+ <li class="listing-list__item">
+ <div class="profile-snippet">
+ <div class="profile-snippet__container clearfix">
+
+ <div class="profile-snippet__name">Ronald L Calabrese</div>
+ <div class="profile-snippet__title">Senior Editor; Emory University, United States</div>
+ </div>
+ </div>
+ </li>
+ </ol>
+ <div class="decision-letter-header__main_text"><p class="paragraph">In the interests of transparency, eLife includes the editorial decision letter and accompanying author responses. A lightly edited version of the letter sent to the authors after peer review is shown, indicating the most substantive concerns; minor comments are not usually included.</p>
+</div>
+</div>
+<p class="paragraph">Thank you for submitting your article "Parallel Visual Circuitry in a Basal Chordate" for consideration by <i>eLife</i>. Your article has been reviewed by Ronald Calabrese as the Senior Editor, a Reviewing Editor, Oliver Hobert, and two reviewers. The reviewers have opted to remain anonymous.</p>
+<p class="paragraph">The reviewers have discussed the reviews with one another and the Reviewing Editor has drafted this decision to help you prepare a revised submission.</p>
+<p class="paragraph">The reviewers – and the Reviewing Editor – agree that the manuscript reports an interesting, exciting set of findings that provide new insight into how visual systems evolve. However, there is also agreement that the evidence behind the GABA receptors being involved in the behavioral response to dimming is entirely indirect, and would be substantially strengthened by a pharmacological parallel to the Glutamate receptor antagonist data. That is, according to the disinhibition model, acute blockade of GABA(A) receptors with a pharmacological antagonist should produce a "hyperactive" movement phenotype akin to the <i>frm</i> mutant animal, but one that should still be capable of phototaxis (but not a dimming response). Such a result would provide an elegant "double dissociation" that would parallel the findings with the AMPA receptor antagonist.</p>
+<p class="paragraph">There is also agreement that the manuscript requires an extensive revision to the Introduction that puts the work in a broader context. At present, the manuscript begins largely with a description of the <i>Ciona</i> connectome, in relation to other complete connectomes, and then plunges directly into a more detailed description of ganglia, cells and synapses. A broader audience could be engaged by the work if the authors identified the key question of interest, and provided some of the background material currently found in the Discussion section, before diving into the pertinent details.</p>
+<p class="paragraph"><i>Reviewer #1:</i></p>
+<p class="paragraph">How the functional architecture of visual systems has evolved to subserve different behavioral goals is a fundamental question of broad interest. At present, while we have a deep understanding of visual system organization in a few experimental models, such a fundamental question can be enriched through the exploration of evolutionarily divergent organisms. In this context, Smith and colleagues integrate a new description of neurotransmitter expression patterns, ultrastructural connectivity, pharmacology and behavior to derive new insights into the architecture of the Ascidian <i>Ciona</i> visual system.</p>
+<p class="paragraph">First, by mapping RNA expression patterns onto neurons spanning the <i>Ciona</i> nervous system using a combination of HCR in situs and image registration, they assign neurotransmitter types to many neurons. Importantly, these studies reveal three classes of ocellus photoreceptors – one that uses glutamate as a transmitter, one that uses GABA, and one that appears to release both. Next, using a glutamate receptor antagonist, they demonstrate that blockade of signaling from glutamatergic photoreceptors blocks phototaxis, but does not affect a second behavior evoked by transient dimming. Finally, consistent with the idea that a subset of photoreceptors could control the dimming response by depolarizing to darkness, and releasing GABA, the authors describe a mutant in which visual input to motor pathways is disrupted, leading to an animal that swims constitutively.</p>
+<p class="paragraph">Overall, this manuscript reports an interesting, exciting set of findings that provide new insight into how visual systems evolve. I find the idea that there might be photoreceptors that appear to hyperpolarize to light and release GABA particularly exciting, and it will be fascinating to learn more about how these photoreceptors are related to retinal and pineal photoreceptors in vertebrates. However, I do feel that the evidence behind these receptors being involved in the behavioral response to dimming is entirely indirect, and would be substantially strengthened by a pharmacological parallel to the Glutamate receptor antagonist data. That is, according to the disinhibition model, acute blockade of GABA(A) receptors with a pharmacological antagonist should produce a "hyperactive" movement phenotype akin to the <i>frm</i> mutant animal, but one that should still be capable of phototaxis (but not a dimming response). Such a result would provide an elegant "double dissociation" that would parallel the findings with the AMPA receptor antagonist.</p>
+<p class="paragraph"><i>Reviewer #2:</i></p>
+<p class="paragraph">The fact that there is a full map of connections in <i>Ciona</i> provides a great opportunity to dissect circuits. Even better, the tools are there to perform some genetic and pharmacological perturbations, and evaluate effects on behavior. This study begins to exploit these features in a study of the <i>Ciona</i> visual system. The authors dug deeper into two circuits that begin with photoreception. They used transgenic reporter animals and in situ hybridization to define the use of two classical neurotransmitters, glutamate and GABA. Surprisingly, one type of photoreceptor uses GABA, an inhibitory neurotransmitter not previously described as used by photoreceptors in any species. From the known connections, they also make a case for how the two circuits are connected, and further suggest that one of the circuits is disinhibitory, perhaps along with other sensory inputs, for oscillatory swimming behavior. Through the use of a specific antagonist for a glutamate receptor they are able to show that one of the photoreceptor circuits is involved with detection of the direction of light (phototaxis), using a behavioral assay. Interestingly, inhibition of phototaxis has no effect on the other circuit, which detects dimming. However, it is likely that there is cross talk between the two photoreceptor circuits, as suggested by the known anatomy.</p>
+<p class="paragraph">Overall, this study provides a very nice example of photoreceptor-directed behavior as controlled by two different circuits. It provides food for thought regarding the evolution of different types of visually guided behaviors and the use of different types of photoreceptors. Optogenetic manipulations and calcium imaging (tried by the authors but did not work due to technical limitations) would greatly add to this story, but as it stands it constitutes a very nice addition to our understanding of a sensory circuit and behavior.</p>
+
+
+
+
+ <span class="doi doi--article-section"><a href="https://doi.org/10.7554/eLife.44753.036" class="doi__link">https://doi.org/10.7554/eLife.44753.036</a></span>
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="SA2"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Author response</h2>
+ </header>
+
+ <div class="article-section__body">
+          <p class="paragraph"><i>The reviewers – and the reviewing editor – agree that the manuscript reports an interesting, exciting set of findings that provide new insight into how visual systems evolve. […] A broader audience could be engaged by the work if the authors identified the key question of interest, and provided some of the background material currently found in the Discussion section, before diving into the pertinent details.</i></p>
+<p class="paragraph">In our revised manuscript, we have thoroughly addressed the reviewers’ concerns and have included extensive new data from behavioral studies using a GABA receptor antagonist (Figure 7 in the revised manuscript, and related text). As you will read in the text, our results with the GABA receptor antagonist (picrotoxin) agree thoroughly with our disinhibition model (and with our observations of the <i>frm</i> mutant). The use of the GABA receptor antagonist was an excellent suggestion, and we feel that the results presented here greatly strengthen our model. As you will see in Figure 7, picrotoxin (like the <i>frm</i> mutant) leads to increased spontaneous swimming. Moreover, picrotoxin also leads to a dramatic reduction in the dimming response. We then show, using picrotoxin combined with the AMPAR antagonist perampanel, that the residual dimming response is due to parallel activation of the excitatory circuit. Finally, we show that picrotoxin-treated larvae are still capable of phototaxis. However, we observed that the phototaxis ability of the picrotoxin-treated larvae was somewhat dampened in comparison to controls, which we attribute to excitotoxicity of prolonged picrotoxin exposure (Movie 5 documents the toxicity of prolonged picrotoxin exposure).</p>
+<p class="paragraph">We have also extensively rewritten the Introduction along the lines suggested by the reviewer. Additionally, as requested, we have included in the text the number of animals tested using the pOpsin1/VGAT Kaede combination (n=5). Finally, we have collected additional data on neurotransmitter use by cells of the motor ganglion. These additional data are presented in revised versions of Figure 4 and Figure 4-figure supplement 1. Our conclusions regarding the minimal circuit are unchanged by this additional data; however, we are revising our neurotransmitter assignment to the anterior pair of ACINs. This reassignment was undertaken after consultation with Kerrianne Ryan (author of the <i>Ciona</i> connectome manuscript). We also include an approved personal communication from Dr. Ryan in this section.</p>
+
+
+
+
+ <span class="doi doi--article-section"><a href="https://doi.org/10.7554/eLife.44753.037" class="doi__link">https://doi.org/10.7554/eLife.44753.037</a></span>
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="info"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Article and author information</h2>
+ </header>
+
+ <div class="article-section__body">
+ <h3 class="authors-details__heading">Author details</h3>
+<ol class="authors-details__authors">
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="x8d8d9914" id="x8d8d9914">
+
+ <h4 class="author-details__name">Matthew J Kourakis</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Neuroscience Research Institute, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Data curation, Formal analysis, Supervision, Investigation, Methodology, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contributed equally with</h5>
+ <span class="author-details__text">Cezar Borba</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+ <section class="author-details__section">
+ <span class="orcid">
+ <a href="https://orcid.org/0000-0002-1261-3811">
+ <picture>
+ <source srcset="/assets/patterns/img/icons/orcid.b96370b9.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/icons/orcid.10f6112b.png" class="orcid__icon"
+ alt="ORCID icon">
+ </picture> <span class="visuallyhidden">"This ORCID iD identifies the author of this article:"</span>
+ 0000-0002-1261-3811</a>
+ </span>
+ </section>
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="xf3e51472" id="xf3e51472">
+
+ <h4 class="author-details__name">Cezar Borba</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Department of Molecular, Cell and Developmental Biology, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Software, Formal analysis, Investigation, Methodology, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contributed equally with</h5>
+ <span class="author-details__text">Matthew J Kourakis</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="xb536f34f" id="xb536f34f">
+
+ <h4 class="author-details__name">Angela Zhang</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Department of Electrical and Computer Engineering, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Software, Formal analysis, Methodology, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="x1d85dfc3" id="x1d85dfc3">
+
+ <h4 class="author-details__name">Erin Newman-Smith</h4>
+
+ <section class="author-details__section">
+ <ol class="author-details__list list list--bullet">
+ <li class="author-details__text">Neuroscience Research Institute, University of California, Santa Barbara, Santa Barbara, United States</li>
+ <li class="author-details__text">Department of Molecular, Cell and Developmental Biology, University of California, Santa Barbara, Santa Barbara, United States</li>
+ </ol>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Conceptualization, Formal analysis, Investigation, Methodology, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="x6107dd5d" id="x6107dd5d">
+
+ <h4 class="author-details__name">Priscilla Salas</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Department of Molecular, Cell and Developmental Biology, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Investigation, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="x8b937bbf" id="x8b937bbf">
+
+ <h4 class="author-details__name">B Manjunath</h4>
+
+ <section class="author-details__section">
+ <span class="author-details__text">Neuroscience Research Institute, University of California, Santa Barbara, Santa Barbara, United States</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Conceptualization, Supervision, Funding acquisition, Project administration, Writing—review and editing</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+
+
+</div>
+
+</li>
+ <li class="authors-details__author"><div class="author-details" data-popup-contents="xa3814a31" id="xa3814a31">
+
+ <h4 class="author-details__name">William C Smith</h4>
+
+ <section class="author-details__section">
+ <ol class="author-details__list list list--bullet">
+ <li class="author-details__text">Neuroscience Research Institute, University of California, Santa Barbara, Santa Barbara, United States</li>
+ <li class="author-details__text">Department of Molecular, Cell and Developmental Biology, University of California, Santa Barbara, Santa Barbara, United States</li>
+ </ol>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Contribution</h5>
+ <span class="author-details__text">Conceptualization, Funding acquisition, Writing—original draft, Project administration</span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">For correspondence</h5>
+ <span class="author-details__text"><a href="mailto:w_smith@ucsb.edu">w_smith@ucsb.edu</a></span>
+ </section>
+ <section class="author-details__section">
+ <h5 class="author-details__heading">Competing interests</h5>
+ <span class="author-details__text">No competing interests declared</span>
+ </section>
+
+ <section class="author-details__section">
+ <span class="orcid">
+ <a href="https://orcid.org/0000-0002-6257-7695">
+ <picture>
+ <source srcset="/assets/patterns/img/icons/orcid.b96370b9.svg" type="image/svg+xml">
+ <img src="/assets/patterns/img/icons/orcid.10f6112b.png" class="orcid__icon"
+ alt="ORCID icon">
+ </picture> <span class="visuallyhidden">"This ORCID iD identifies the author of this article:"</span>
+ 0000-0002-6257-7695</a>
+ </span>
+ </section>
+
+
+</div>
+
+</li>
+</ol>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Funding</h3>
+ </header>
+
+ <div class="article-section__body">
+ <section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h4 class="article-section__header_text">National Institute of Neurological Disorders and Stroke (R01NS103774)</h4>
+ </header>
+
+ <div class="article-section__body">
+
+
+ <ul class="list list--bullet">
+ <li>William C Smith</li>
+ </ul>
+
+
+
+
+
+ </div>
+
+</section>
+<p class="paragraph">The funders had no role in study design, data collection and interpretation, or the decision to submit the work for publication.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Acknowledgements</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p class="paragraph">We thank Takeo Horie and Takahiro Kusakabe for the opsin1 promoter construct; Yasunori Sasakura for the stable pVGAT &gt;kaede line and pVACHT &gt;CFP plasmid; Haruo Okado for the pAMPAR &gt;GFP construct. Kerrianne Ryan for her helpful discussion and sharing unpublished data. Chelsea Parlett-Pelleriti for her advice on statistical analysis. We acknowledge the use of the NRI-MCDB Microscopy Facility and the Resonant Scanning Confocal supported by NSF MRI grant 1625770. This work supported by an award from NIH (NS103774) to WCS and BM.</p>
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Senior Editor</h3>
+ </header>
+
+ <div class="article-section__body">
+
+ <ol class="list">
+ <li>Ronald L Calabrese, Emory University, United States</li>
+ </ol>
+
+
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Reviewing Editor</h3>
+ </header>
+
+ <div class="article-section__body">
+
+ <ol class="list">
+ <li>Oliver Hobert, Howard Hughes Medical Institute, Columbia University, United States</li>
+ </ol>
+
+
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Publication history</h3>
+ </header>
+
+ <div class="article-section__body">
+
+ <ol class="list list--bullet">
+ <li>Received: December 28, 2018</li>
+ <li>Accepted: April 11, 2019</li>
+ <li>Accepted Manuscript published: <a href="/articles/44753v1">April 18, 2019 (version 1)</a></li>
+ <li>Version of Record published: <a href="/articles/44753">May 3, 2019 (version 2)</a></li>
+ </ol>
+
+
+
+
+
+
+ </div>
+
+</section>
+<section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h3 class="article-section__header_text">Copyright</h3>
+ </header>
+
+ <div class="article-section__body">
+ <p>© 2019, Kourakis et al.</p><p>This article is distributed under the terms of the <a href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License</a>, which permits unrestricted use and redistribution provided that the original author and source are credited.</p>
+
+
+
+ </div>
+
+</section>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+ id="metrics"
+ data-behaviour="ArticleSection"
+ data-initial-state="closed"
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Metrics</h2>
+ </header>
+
+ <div class="article-section__body">
+ <ul class="statistic-collection clearfix">
+ <li class="statistic-collection__item">
+ <dl class="statistic">
+ <dd class="statistic__value">
+ 807
+ </dd>
+ <dt class="statistic__label">
+ Page views
+ </dt>
+ </dl>
+ </li>
+ <li class="statistic-collection__item">
+ <dl class="statistic">
+ <dd class="statistic__value">
+ 173
+ </dd>
+ <dt class="statistic__label">
+ Downloads
+ </dt>
+ </dl>
+ </li>
+ <li class="statistic-collection__item">
+ <dl class="statistic">
+ <dd class="statistic__value">
+ 0
+ </dd>
+ <dt class="statistic__label">
+ Citations
+ </dt>
+ </dl>
+ </li>
+</ul>
+<p class="paragraph">Article citation count generated by polling the highest count across the following sources: <a href="">Crossref</a>, <a href="">PubMed Central</a>, <a href="">Scopus</a>.</p>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+ <section
+ class="article-section "
+
+
+
+>
+
+ <header class="article-section__header">
+ <h2 class="article-section__header_text">Download links</h2>
+ </header>
+
+ <div class="article-section__body">
+ <div data-behaviour="ArticleDownloadLinksList" id="downloads" aria-labelledby="downloads-label">
+ <div class="visuallyhidden"><span id="downloads-label">A two-part list of links to download the article, or parts of the article, in various formats.</span></div>
+
+ <h3 class="article-download-links-list__heading">Downloads<span class="visuallyhidden"> (link to download the article as PDF)</span></h3>
+ <ul class="article-download-list">
+ <li><a href="https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D" class="article-download-links-list__link"
+
+ data-article-identifier="10.7554/eLife.44753"
+ data-download-type="pdf-article"
+
+ >Article PDF</a></li>
+ <li><a href="https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtZmlndXJlcy12Mi5wZGY=/elife-44753-figures-v2.pdf?_hash=x4qA8GP%2BKBA2SOVJsL3falcqZCUNChW4fqaIfFIjgHk%3D" class="article-download-links-list__link"
+
+ data-article-identifier="10.7554/eLife.44753"
+ data-download-type="pdf-figures"
+
+ >Figures PDF</a></li>
+ </ul>
+ <h3 class="article-download-links-list__heading">Download citations<span class="visuallyhidden"> (links to download the citations from this article in formats compatible with various reference manager tools)</span></h3>
+ <ul class="article-download-list">
+ <li><a href="/articles/44753.bib" class="article-download-links-list__link"
+
+
+ >BibTeX</a></li>
+ <li><a href="/articles/44753.ris" class="article-download-links-list__link"
+
+
+ >RIS</a></li>
+ </ul>
+ <h3 class="article-download-links-list__heading">Open citations<span class="visuallyhidden"> (links to open the citations from this article in various online reference manager services)</span></h3>
+ <ul class="article-download-list">
+ <li><a href="https://www.mendeley.com/import?doi=10.7554/eLife.44753" class="article-download-links-list__link"
+
+
+ >Mendeley</a></li>
+ <li><a href="https://www.readcube.com/articles/10.7554/eLife.44753" class="article-download-links-list__link"
+
+
+ >ReadCube</a></li>
+ <li><a href="papers2://url/https%3A%2F%2Felifesciences.org%2Farticles%2F44753?title=Parallel+visual+circuitry+in+a+basal+chordate" class="article-download-links-list__link"
+
+
+ >Papers</a></li>
+ <li><a href="http://www.citeulike.org/posturl?url=https%3A%2F%2Felifesciences.org%2Farticles%2F44753&amp;title=Parallel+visual+circuitry+in+a+basal+chordate&amp;doi=10.7554/eLife.44753" class="article-download-links-list__link"
+
+
+ >CiteULike</a></li>
+ </ul>
+
+</div>
+
+
+
+
+ </div>
+
+</section>
+
+
+
+
+<section class="article-meta">
+
+ <div class="article-meta__container">
+
+
+ <section class="article-meta__group">
+ <h4 class="article-meta__group_title">Categories and tags</h4>
+ <ul class="article-meta__link_list">
+ <li class="article-meta__link_list_item">
+ <a href="/articles/research-article" class="article-meta__link">Research Article</a></li>
+ <li class="article-meta__link_list_item">
+ <a href="/subjects/neuroscience" class="article-meta__link">Neuroscience</a></li>
+ <li class="article-meta__link_list_item">
+ <a href="/search?for=connectome" class="article-meta__link">connectome</a></li>
+ <li class="article-meta__link_list_item">
+ <a href="/search?for=visuomotor" class="article-meta__link">visuomotor</a></li>
+ <li class="article-meta__link_list_item">
+ <a href="/search?for=behavior" class="article-meta__link">behavior</a></li>
+ </ul>
+ </section>
+
+
+ <section class="article-meta__group">
+ <h4 class="article-meta__group_title">Research organism</h4>
+ <ul class="article-meta__link_list">
+ <li class="article-meta__link_list_item">
+ <a href="/search?for=C.%20intestinalis" class="article-meta__link"><i>C. intestinalis</i></a></li>
+ </ul>
+ </section>
+
+
+ </div>
+
+</section>
+
+
+
+
+
+
+
+ </div>
+
+
+ <div class="grid__item one-whole
+
+ large--four-twelfths x-large--three-twelfths
+ grid-secondary-column">
+
+ <div class="grid-secondary-column__item grid-secondary-column__item--wide-only">
+
+ <div>
+
+
+ <ol class="listing-list ">
+ <li class="listing-list__item"><div class="teaser teaser--secondary teaser--related ">
+
+ <ol class="teaser__context_label_list" aria-label="These research categories are for the following article">
+ <li class="teaser__context_label_item">
+
+ <span class="teaser__context_label">Of interest</span>
+ </li>
+ </ol>
+
+ <header class="teaser__header">
+
+
+ <h4 class="teaser__header_text">
+ <a href="/articles/48779" class="teaser__header_text_link">An arbitrary-spectrum spatial visual stimulator for vision research</a>
+ </h4>
+
+ <div class="teaser__secondary_info">
+ Katrin Franke et al.
+ </div>
+
+ </header>
+
+
+ <footer class="teaser__footer">
+
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/tools-resources" >Tools and Resources</a>
+
+
+
+ <span class="date"> Updated <time datetime="2019-10-08">Oct 8, 2019</time></span>
+ </div>
+
+
+ </footer>
+</div>
+</li><li class="listing-list__item"><a href="#listing" class="see-more-link">Further reading</a>
+</li></ol>
+
+
+</div>
+
+
+ </div>
+
+ </div>
+
+
+ </div>
+
+</div>
+
+
+ <div class="wrapper listing-read-more">
+
+ <div class="grid">
+
+ <div class="content-container grid__item
+ one-whole
+ large--ten-twelfths
+ push--large--one-twelfth
+ x-large--eight-twelfths
+ push--x-large--two-twelfths
+ grid-column">
+
+ <div class="listing-list-heading">
+ <h3 class="list-heading">Further reading</h3>
+ </div>
+
+ <ol class="listing-list listing-list--read-more" id="listing">
+ <li class="listing-list__item listing-list__item--related">
+ <div class="listing-list__divider"></div>
+ <header class="content-header content-header--read-more clearfix content-header--header">
+
+ <ol class="content-header__subject_list">
+ <li class="content-header__subject_list_item">
+ <span class="content-header__subject">Neuroscience</span>
+ </li>
+ </ol>
+
+ <div class="content-header__body">
+ <h1 class="content-header__title content-header__title--long">
+ <a href="/articles/48779" class="content-header__title_link">An arbitrary-spectrum spatial visual stimulator for vision research</a>
+ </h1>
+ </div>
+
+ <div class="content-header__authors content-header__authors--line">Katrin Franke et al.</div>
+
+ <div class="content-header__meta">
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/tools-resources" >Tools and Resources</a>
+
+
+
+ <span class="date"> Updated <time datetime="2019-10-08">Oct 8, 2019</time></span>
+ </div>
+ </div>
+
+ </header>
+ </li>
+ <li class="listing-list__item ">
+ <div class="listing-list__divider"></div>
+ <header class="content-header content-header--read-more clearfix content-header--header">
+
+ <ol class="content-header__subject_list">
+ <li class="content-header__subject_list_item">
+ <span class="content-header__subject">Neuroscience</span>
+ </li>
+ </ol>
+
+ <div class="content-header__body">
+ <h1 class="content-header__title content-header__title--long">
+ <a href="/articles/47996" class="content-header__title_link">Self-organization of modular network architecture by activity-dependent neuronal migration and outgrowth</a>
+ </h1>
+ </div>
+
+ <div class="content-header__authors content-header__authors--line">Samora Okujeni, Ulrich Egert</div>
+
+ <div class="content-header__meta">
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/research-article" >Research Article</a>
+
+
+
+ <span class="date"> Updated <time datetime="2019-10-08">Oct 8, 2019</time></span>
+ </div>
+ </div>
+
+ </header>
+ </li>
+ <li class="listing-list__item ">
+ <div class="listing-list__divider"></div>
+ <header class="content-header content-header--read-more clearfix content-header--header">
+
+ <ol class="content-header__subject_list">
+ <li class="content-header__subject_list_item">
+ <span class="content-header__subject">Neuroscience</span>
+ </li>
+ </ol>
+
+ <div class="content-header__body">
+ <h1 class="content-header__title content-header__title--long">
+ <a href="/articles/48114" class="content-header__title_link">Pretectal neurons control hunting behaviour</a>
+ </h1>
+ </div>
+
+ <div class="content-header__authors content-header__authors--line">Paride Antinucci et al.</div>
+
+ <div class="content-header__meta">
+ <div class="meta">
+
+ <a class="meta__type" href="/articles/research-article" >Research Article</a>
+
+
+
+ <span class="date"> <time datetime="2019-10-08">Oct 8, 2019</time></span>
+ </div>
+ </div>
+
+ </header>
+ </li>
+
+ </ol>
+
+
+ </div>
+
+ </div>
+
+</div>
+
+
+
+
+ </div>
+
+
+ </main>
+
+ <section class="email-cta">
+
+</section>
+
+
+ <div class="main-menu" id="mainMenu" data-behaviour="MainMenu" tabindex="0">
+ <nav class="main-menu__container" role="navigation">
+ <h3 class="list-heading">Menu</h3>
+ <ul class="main-menu__list">
+ <li class="main-menu__list_item">
+ <a href="/subjects" class="main-menu__list_link">Research categories</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="https://submit.elifesciences.org/html/elife_author_instructions.html" class="main-menu__list_link">Author guide</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="https://submit.elifesciences.org/html/elife_reviewer_instructions.html" class="main-menu__list_link">Reviewer guide</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="/about" class="main-menu__list_link">About</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="/inside-elife" class="main-menu__list_link">Inside eLife</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="/community" class="main-menu__list_link">Community</a>
+ </li>
+ <li class="main-menu__list_item">
+ <a href="/labs" class="main-menu__list_link">Innovation</a>
+ </li>
+ </ul>
+ <a href="#siteHeader" class="to-top-link">Back to top</a>
+ </nav>
+ </div>
+
+<ol class="investor-logos" role="contentinfo" aria-label="eLife is funded by these organisations">
+ <li class="investor-logos__item">
+
+ <div class="investor-logos__container">
+ <picture class="investor-logos__picture">
+ <source srcset="/assets/images/investors/hhmi.9d0951a2.svg"
+ type="image/svg+xml"
+ >
+ <source srcset="/assets/images/investors/hhmi@2x.e63a8d68.webp 2x, /assets/images/investors/hhmi@1x.c1e8d1b9.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="/assets/images/investors/hhmi@2x.58718155.png 2x, /assets/images/investors/hhmi@1x.ad4627a8.png 1x"
+ type="image/png"
+ >
+ <img src="/assets/images/investors/hhmi@1x.ad4627a8.png"
+
+ alt="Howard Hughes Medical Institute"
+ class="investor-logos__img"
+ >
+ </picture>
+ </div>
+
+ </li>
+ <li class="investor-logos__item">
+
+ <div class="investor-logos__container">
+ <picture class="investor-logos__picture">
+ <source srcset="/assets/images/investors/wellcome.813f8634.svg"
+ type="image/svg+xml"
+ >
+ <source srcset="/assets/images/investors/wellcome@2x.993dd002.webp 2x, /assets/images/investors/wellcome@1x.1fd7fa84.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="/assets/images/investors/wellcome@2x.75f8d6f9.png 2x, /assets/images/investors/wellcome@1x.ff6d9292.png 1x"
+ type="image/png"
+ >
+ <img src="/assets/images/investors/wellcome@1x.ff6d9292.png"
+
+ alt="Wellcome Trust"
+ class="investor-logos__img"
+ >
+ </picture>
+ </div>
+
+ </li>
+ <li class="investor-logos__item">
+
+ <div class="investor-logos__container">
+ <picture class="investor-logos__picture">
+ <source srcset="/assets/images/investors/max.090f7458.svg"
+ type="image/svg+xml"
+ >
+ <source srcset="/assets/images/investors/max@2x.3215c512.webp 2x, /assets/images/investors/max@1x.8fabbf5a.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="/assets/images/investors/max@2x.d233b5b1.png 2x, /assets/images/investors/max@1x.5daaf9a0.png 1x"
+ type="image/png"
+ >
+ <img src="/assets/images/investors/max@1x.5daaf9a0.png"
+
+ alt="Max-Planck-Gesellschaft"
+ class="investor-logos__img"
+ >
+ </picture>
+ </div>
+
+ </li>
+ <li class="investor-logos__item">
+
+ <div class="investor-logos__container">
+ <picture class="investor-logos__picture">
+ <source srcset="/assets/images/investors/kaw.c1bb2e4b.svg"
+ type="image/svg+xml"
+ >
+ <source srcset="/assets/images/investors/kaw@2x.0afbcf57.webp 2x, /assets/images/investors/kaw@1x.04f3c517.webp 1x"
+ type="image/webp"
+ >
+ <source srcset="/assets/images/investors/kaw@2x.cc1a5adc.png 2x, /assets/images/investors/kaw@1x.318b49a9.png 1x"
+ type="image/png"
+ >
+ <img src="/assets/images/investors/kaw@1x.318b49a9.png"
+
+ alt="Knut and Alice Wallenberg Foundation"
+ class="investor-logos__img"
+ >
+ </picture>
+ </div>
+
+ </li>
+</ol>
+
+<footer class="site-footer">
+
+ <div class="site-footer__container">
+
+ <div class="grid-cell">
+
+ <nav class="footer-navigation">
+ <ul class="footer-navigation__list">
+ <li class="footer-navigation__list_item">
+ <a href="/about" class="footer-navigation__list_link">About</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/jobs" class="footer-navigation__list_link">Jobs</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/who-we-work-with" class="footer-navigation__list_link">Who we work with</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/alerts" class="footer-navigation__list_link">Alerts</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/contact" class="footer-navigation__list_link">Contact</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/terms" class="footer-navigation__list_link">Terms and conditions</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/privacy" class="footer-navigation__list_link">Privacy notice</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/inside-elife" class="footer-navigation__list_link">Inside eLife</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/archive/2019" class="footer-navigation__list_link">Monthly archive</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/labs" class="footer-navigation__list_link">Innovation</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/for-the-press" class="footer-navigation__list_link">For the press</a>
+ </li>
+ <li class="footer-navigation__list_item">
+ <a href="/resources" class="footer-navigation__list_link">Resources</a>
+ </li>
+ </ul>
+ </nav>
+
+ <div class="github-link-wrapper">
+ <a href="https://github.com/elifesciences" class="github-link">
+ <div class="github-link--text">Find us on GitHub</div>
+ </a>
+ </div>
+
+ </div>
+
+ <div class="grid-cell">
+
+ <div class="site-smallprint">
+ <small>eLife is a non-profit organisation inspired by research funders and led by scientists. Our mission is to help scientists accelerate discovery by operating a platform for research communication that encourages and recognises the most responsible behaviours in science.</small>
+ <small>eLife Sciences Publications, Ltd is a limited liability non-profit non-stock corporation incorporated in the State of Delaware, USA, with company number 5030732, and is registered in the UK with company number FC030576 and branch number BR015634 at the address:</small>
+
+ <address>
+ eLife Sciences Publications, Ltd<br>
+ Westbrook Centre, Milton Road<br>
+ Cambridge CB4 1YG<br>
+ UK
+ </address>
+ </div>
+
+ </div>
+
+ <div class="grid-cell">
+ <div class="site-smallprint site-smallprint__copyright">
+ <small>© <time>2019</time> eLife Sciences Publications Ltd. Subject to a <a href="https://creativecommons.org/licenses/by/4.0/" rel="license" class="site-smallprint__copyright_link">Creative Commons Attribution license</a>, except where otherwise noted. ISSN:&nbsp;2050-084X</small>
+ </div>
+ </div>
+
+ </div>
+
+</footer>
+
+
+ </div>
+
+ </div>
+ <link href="/assets/patterns/css/all.ad4007d5.css" rel="stylesheet">
+
+
+<script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","licenseKey":"c53c018d69","applicationID":"29775807","transactionName":"NQQGNUZZWEACVhdZWQxOJQJAUVldTFQRRF8BDQE=","queueTime":0,"applicationTime":287,"atts":"GUMFQw5DS04=","errorBeacon":"bam.nr-data.net","agent":""}</script></body>
+
+</html>
diff --git a/python/tests/files/first_monday_ojs3_fulltext.html b/python/tests/files/first_monday_ojs3_fulltext.html
new file mode 100644
index 0000000..2248aed
--- /dev/null
+++ b/python/tests/files/first_monday_ojs3_fulltext.html
@@ -0,0 +1,441 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<title>Surveillance, stigma and sociotechnical design for HIV</title>
+</head>
+<body bgcolor="#ffffff" LINK="#bb7777" VLINK="#7777bb" ALINK="#ffee99" text="#000000">
+<blockquote><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71629" border="1" alt="First Monday" align="bottom"><br></blockquote>
+<hr>
+<blockquote>
+
+<center><a href="#author"><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71975" alt="Surveillance, stigma and sociotechnical design for HIV by Calvin Liang, Jevan Alexander Hutson, and Os Keyes" border="1"></a></center>
+
+<br><hr><br>
+
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71627" alt="Abstract"><br>Online dating and hookup platforms have fundamentally changed people&rsquo;s day-to-day practices of sex and love &mdash; but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms &ldquo;work&rdquo; for HIV frequently focus on user-to-user interactions and disclosure of one&rsquo;s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.</p>
+
+<p><strong>Contents</strong></p>
+<p><a href="#p1">Introduction</a><br>
+<a href="#p2">Methods</a><br>
+<a href="#p3">Findings</a><br>
+<a href="#p4">Discussion</a><br>
+<a href="#p5">Conclusion</a></p>
+
+<p>&nbsp;</p><hr><p>&nbsp;</p>
+<p><strong><a name="p1"></a>Introduction</strong></p>
+
+<table width="70%" align="center"><tr><td>&ldquo;AIDS is essentially a crisis of governance, of what governments do and do not do, to and for their people &mdash; we have the drugs to treat HIV infection, we have the tools to confront the risks that drive HIV transmission and prevent infection itself &mdash; what we don&rsquo;t have is national political will necessary to scale-up our response. We have demanded too little from our leaders, excused far too much.&rdquo;<br>&mdash; Gregg Gonsalves, speech at the 2006 Toronto AIDS Conference.</td></tr></table>
+
+<table width="70%" align="center"><tr><td>&ldquo;Design is inherently about change &mdash; not just in the creation of new material artifacts, but in the ways that new technological objects afford new practices, social habits, and ways of living and interacting.&rdquo;<br>&mdash; Dombrowski, <em>et al.</em> (2016). &ldquo;Social justice-oriented interaction design: Outlining key design strategies and commitments.&rdquo;</td></tr></table>
+
+<p>Living and loving with HIV is a complicated task. HIV status and the stigma attached to it exists within a complex interplay of social norms and medicolegal infrastructures. The medicolegal history of HIV begins the moment that HIV and AIDS emerged, constituting a mix of medically justified legal norms and legally enforced medical requirements. The criminal justice and public health systems of modern states demarcated people living with HIV as a uniquely dangerous population, &ldquo;one that needed to be sought out, tracked down, tested, reported, listed, tagged, monitored, regulated, and, increasingly, criminalized&rdquo; <a name="1a"></a>[<a href="#1">1</a>].</p>
+
+<p>The immediate policy response in the United States imposed significant criminal and civil liability upon people living with HIV (Hoppe, 2018; Harsono, <em>et al.</em>, 2017; Sykes, <em>et al.</em>, 2016; Thrasher, 2015; Galletly, <em>et al.</em>, 2014; Lehman, <em>et al.</em>, 2014; Gagnon, 2012; Pollard, 2006; Gostin, <em>et al.</em>, 1999). Between 1986&ndash;2019, HIV-specific criminal laws and sentence enhancements applicable to people living with HIV have been enacted in 34 states and two U.S. territories (Center for HIV Law &amp; Policy, 2019; Lehman, <em>et al.</em>, 2014). Since 1986, these laws have criminalized nondisclosure of HIV and engagement in &ldquo;risky&rdquo; behaviors such as sexual activity, exposure to bodily fluids, needle sharing, sex work, blood/organ/semen donation, and, in a variety of instances, behaviors posing little, if any, risk of HIV transmission (Center for Disease Control and Prevention, 2019a; Center for HIV Law &amp; Policy, 2019).</p>
+
+<p>Despite claiming medical legitimacy for this punitive approach, researchers have long understood that the criminalization of HIV transmission was instead fueled by the associations between HIV and the gay community and communities of color (Hoppe, 2018; Gallo, 2006; Johnson, 1992; Banks, 1989) at a time when consensual sex between same-sex partners was a criminal offense in twenty-two states and over 61 percent of American evangelicals and 50 percent of non-evangelicals agreed with the statement &ldquo;I sometimes think AIDS is a punishment for the decline in moral standards&rdquo; (Gallup and Castelli, 1987).</p>
+
+<p>A significant body of empirical social science work documents the harmful effects HIV laws have had on the lives of people living with HIV (Barr&eacute;Sinoussi, <em>et al.</em>, 2018; Harsono, <em>et al.</em>, 2017; Sweeney, <em>et al.</em>, 2017; Adam, <em>et al.</em>, 2014). HIV criminalization both reinforces and magnifies HIV-related stigma and discrimination, reduces the willingness of persons at risk for HIV to get tested or seek care, and imperils demographic health collection of information (Harsono, <em>et al.</em>, 2017; Burris and Cameron, 2008; Galletly and Pinkerton, 2006; Elliot, 2002). A survey of over 2,000 people living with HIV in the U.S. revealed that at least 25 percent of respondents knew one or more individuals who were afraid to get tested for fear of facing criminalization (Sero Project, 2012). HIV criminalization also ignores the reality that successful antiretroviral therapy can render the level of the virus to undetectable, which, according to the National Institute of Health, means that HIV is then untransmittable (Eisinger, <em>et al.</em>, 2019).</p>
+
+<p>While HIV transmission was criminalized, other tools of control &mdash; in the form of surveillance &mdash; arose and were enforced. Early policy responses to HIV centered on overt surveillance and ostracism of those infected and perceived to be at risk (Fortin, 1995). This surveillance generally consists of disease reporting, sexual contact tracing, and data collection of people who have been diagnosed with HIV (Fan, 2012; 2011; Ward and Bell, 2014; Ward, 2005). The Center for Disease Control, for example, collects HIV data based on confidential name-based reporting laws implemented in all 50 states as of April 2008 (Center for Disease Control and Prevention, 2019b).</p>
+
+<p>HIV surveillance (and sexually transmitted infection surveillance more broadly) centralizes information and power in the state (Fairchild, <em>et al.</em>, 2007; Fan, 2012); because HIV intervention and surveillance is generally concentrated in lower income communities and health settings (McCree and Hogben, 2010), the most socially and economically marginalized communities bear the heaviest burden of HIV surveillance and its downstream consequences (Miller, <em>et al.</em>, 2004; Banks, 1989; Brandt, 1987). There is a long-racialized history of HIV, one that, in combination with the background racism of the United States, has led to the systemic undertreatment and under-consideration of communities of color (Ford, <em>et al.</em>, 2007; Anonymous, 2000; Johnson, 1992).</p>
+
+<p>This infrastructure of surveillance in turn reinforces the stigma of HIV, which has dramatic consequences for the likelihood of unwanted disclosure, access to care, psychiatric well-being, housing and employment discrimination, and, consequently, quality (or probability) of life (Lazarus, <em>et al.</em>, 2016; Mahajan, <em>et al.</em>, 2008). Coupled with the overarching stigma of HIV and its criminalization in various contexts, HIV surveillance offers a tool through which the state can identify citizens to be punished.</p>
+
+<p>In the era of &ldquo;big data&rdquo; and ubiquitous surveillance capitalism (Zuboff, 2019) &mdash; the private monetization of information about reality &mdash; HIV surveillance is not just in the hands of the state, but also in the hands of private organizations and individuals. In the context of widespread state surveillance and control and ongoing stigmatization of HIV, this opens yet more possibilities for harm through enabling the selling and redistribution of HIV status information, without the user&rsquo;s meaningful consent, to parties who may themselves engage in discrimination or direct violence.</p>
+
+<p>Many online platforms &mdash; including, as we trace out below, dating platforms &mdash; constitute not just spaces for the purposes outlined in their marketing materials but also tools for the police in tracing HIV status and criminalized behavior. In recent years, police have used technology to conduct Internet-based investigations for a similar purpose (POZ, 2015). Police now go undercover on Web sites and dating apps by creating fake identities online (Semitsu, 2011), and local law enforcement agencies and federal agencies increasingly employ these tactics in online investigations (Lichtblau and Arkin, 2014).</p>
+
+<p>Legal and public health scholars and advocates continue to call for a paradigm shift in managing HIV that leaves behind historical responses like surveillance, ostracism, and incarceration and accounts for the rise of the Internet and mobile technology and their impact on sexual attitudes and behaviors (Lehman, <em>et al.</em>, 2014; McCallum, 2014; Fan, 2011; Fenton, 2010). Since the criminalization of HIV, intimate platforms have become vital structures through which millions of people access the opportunity to engage in reciprocal romantic and sexual relationships (Hutson, <em>et al.</em>, 2018; Taylor, <em>et al.</em>, 2017; Rosenfeld and Thomas, 2012). By designing infrastructures for intimate affiliation, intimate platforms wield unmatched structural power to shape who meets whom and how within dating and sexual platforms (Hutson, <em>et al.</em>, 2018; Levy and Barocas, 2018; Emens, 2008; Robinson, 2007). These platforms frame the circumstances within which users understand each other as prospective romantic or sexual partners and shape social norms, sexual scripts, and relative advantages among users (Hardy and Lindtner, 2017; Kannabiran, <em>et al.</em>, 2012).</p>
+
+<p>The design of intimate platforms provides opportunities to explore new ways of managing HIV that reduce the concentration of power and information in the state (Fan, 2012). Through the role that platform design plays in shaping cultural norms, which has been identified as a more effective way of achieving HIV transmission prevention than flexing the punitive and surveillant arms of the state (Sunstein, 1996), intimate platform design provides opportunities to explore new ways of managing HIV (Fan, 2012). Indeed, a meta-analysis of HIV prevention efforts found that strategies that intervene in social meaning by shaping social norms, cultural practices, and individual attitudes were more effective in empowering behavioral change than appeals to fear (Albarracin, <em>et al.</em>, 2015).</p>
+
+<p>However, designing intimate platforms to account for HIV also presents serious challenges for social computing researchers and human-computer interaction (HCI) designers. As Handel and Shklovski pointed out: &ldquo;The minutiae of design decisions around profile options deserves particular attention because even the smallest changes can result in substantial differences for user interactions&rdquo; (Handel and Shklovski, 2012). In addition to concerns around how to best design for HIV, platforms, Grindr in particular, have already come under fire for sharing user HIV information with third parties (Singer, 2018). Moreover, designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the serious risk of re-entrenching the status quo and its incumbent inequalities and power relations (Bardzell, 2010). While designing for HIV presents opportunities to redress stigma and harm, researchers in HCI must understand that &ldquo;[i]t is not enough to have good intentions ... [we] must ground [our] efforts in clear political commitments and rigorous evaluations of the likely consequences&rdquo; (Green, 2018).</p>
+
+<p>From this comes the recognition that social computing designers and researchers seeking to design for disclosure cannot afford to ignore the ways that the lived experiences of people living with HIV are shaped by structural forces and, particularly, the reality of HIV criminalization and the State&rsquo;s role in conducting STD surveillance. Platforms, after all, do not exist in a separate sphere from material reality: a redesign that eases HIV disclosure from user-to-user might also involve the storing of disclosure data by the platform &mdash; data that can then be accessed, requisitioned, and co-opted by arms of the state. In line with Jackson, <em>et al.&rsquo;s</em> call for the social computing community to address the structural and lived consequences of law and policy that &ldquo;establish the very terrain on which design and practice can be conceived, articulated, and imagined &mdash; and upon which battles of accountability are inevitably waged&rdquo; <a name="2a"></a>[<a href="#2">2</a>], we wish to undertake a critical investigation of HIV disclosure in dating and hookup platforms. This involves not just investigating the implications of disclosure in a person-to-person sense, but also how platform design is shaped by legal and administrative regulation and how the risks of disclosure might open users up to systems of surveillance, stigma, and criminalization. We do so by using a range of platforms in an effort to gain a wide view, and to practice prefigurative politics &mdash; minimizing our assumptions about the &ldquo;type&rdquo; of people at risk of HIV infection and/or surveillance.</p>
+
+<p>To do this, we analyze platform&rsquo;s consequences for HIV through the lens of user-to-user interactions, exploring the ways that design renders users visible and vulnerable to wider carceral and surveillance infrastructures, and the way that design shapes (and is shaped) by HIV&rsquo;s legal status. We ground our discussion in a content analysis of 50 popular, mobile dating and hookup platforms, coding for design and policy choices related to HIV disclosure, prevention, destigmatization, surveillance, privacy, and criminalization. Through this, we reveal that many platforms fail to account for HIV, and of those that do, many neglect to attend to the downstream consequences of HIV disclosure and the data produced by it, while exacerbating the social, racial, and class stereotypes associated with the condition.</p>
+
+<p>As scholars and designers consider how platform design might aid HIV prevention and destigmatization (Hutson, <em>et al.</em>, 2018; Albury, <em>et al.</em>, 2017; Wohlfeiler, <em>et al.</em>, 2013; Rosser, <em>et al.</em>, 2011), we aim to grapple with the structural and ethical implications of designing for HIV, particularly how intimate platform design might aid and abet the decriminalization and surveillance of HIV (Sykes, <em>et al.</em>, 2016; Kazatchkine, <em>et al.</em>, 2015; Perone, 2013; Gagnon, 2012; J&uuml;rgens, <em>et al.</em>, 2009). Drawing on principles from social justice-oriented design to investigate controversies and design possibilities in intimate platforms, we attempt to articulate an approach to intimate platform design that not only works to reduce the stigma of user disclosure, but also works to contest historic and present power imbalances and injustices between users, platforms, and the state.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p2"></a>Methods</strong></p>
+
+<p>Using a directed content analysis (Hsieh and Shannon, 2005), we reviewed 50 existing mobile dating and hookup platforms. Content analyses have proven effective in understanding platform design and governance and the ways design practices mediate user-to-user bias and discrimination (Levy and Barocas, 2018; Hutson, <em>et al.</em>, 2018). We set out to capture a landscape of popular platforms and selected the first 50 dating and hook up platforms in the top 200 grossing social networking applications in the United States on the iOS App Store in March of 2019. <a href="#fig1">Figure 1</a> lists the platforms selected in alphabetical order.</p>
+
+<p>&nbsp;</p>
+<a name="fig1"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71623" alt="50 dating and hookup platforms surveyed"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 1:</strong> The 50 dating and hookup platforms surveyed.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p>Utilizing the walkthrough method (Light, <em>et al.</em>, 2018), we explored each platform&rsquo;s HIV-related user experience. We examined design features on each of these platforms, systematically documenting design choices, policies, and informational interventions that mediate HIV. Building upon previous work around intimate platforms and HIV, we coded each of the 50 intimate platforms based on the following dimensions:</p>
+
+<table width="70%" align="center"><tr><td><p>Prevention</p>
+<ul><li>Whether the app allows same-sex connections</li>
+<li>Whether a user can disclose HIV/sexually transmitted infection (STI) status (Warner, <em>et al.</em>, 2018)</li>
+<li>If they can disclose, what are the options? (Warner, <em>et al.</em>, 2018)</li>
+<li>Whether a user can search for or filter out users with HIV/STIs? (Hutson, <em>et al.</em>, 2018)</li>
+<li>Whether the platforms provide informational interventions with respect to HIV/STI prevention (Wang, <em>et al.</em>, 2019)</li></ul>
+<p>Stigma reduction</p>
+<ul><li>Whether a user can identify as having HIV/STI (<em>e.g.</em>, &ldquo;Poz&rdquo;, etc.)</li>
+<li>Whether a user can indicate interest in or acceptance of people living with HIV/STIs (<em>e.g.</em> outward presentation, separate from filtering, not simply via profile text) (Hutson, <em>et al.</em>, 2018)</li></ul>
+<p>Policies</p>
+<ul><li>Whether the platform engages HIV/STIs in their policies (terms of service, privacy, and community policies, etc.) (Jackson, <em>et al.</em>, 2014)</li></ul></td></tr></table>
+
+<p>For ethical reasons, we did not interact with other users, only observed features, and deleted our accounts once data were collected when possible (not all platforms allowed for account deletion). The design and policy choices described and discussed below are not intended as an endorsement of any particular design intervention for managing HIV. Rather, we aim to capture the various ways intimate platforms currently manage and mediate HIV among users and how those choices map onto extant legal and surveillant infrastructures. Additionally, we highlight two limitations in how we chose which platforms to analyze. First, it is possible for a hook-up platform to not have an accompanying mobile app, meaning our selection of platforms from the iOS app store will have invariably missed Web site-based platforms. Second, we may have overlooked platforms that are more niche or community-specific, yet not as popular in the broader platform marketplace (<em>i.e.</em>, not within the top grossing platforms).</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p3"></a>Findings</strong></p>
+
+<p>&nbsp;</p>
+<a name="fig2"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71624" alt="A visualization of our content analysis"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 2:</strong> A visualization of our content analysis.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p><em><strong>Design features</strong></em></p>
+
+<p>Out of the 50 intimate platforms we examined, 13 were meant specifically for queer communities (11 specifically targeted at gay and bisexual men and two at lesbian and bisexual women). None of the platforms we reviewed were distinctly designed for trans people. The remaining 34 platforms were for general audiences, catering to heterosexual and homosexual connections, and three platforms were exclusively for heterosexual connections (eHarmony, Uniform Dating, and Waplog). Only queer-specific platforms (six) had explicit HIV disclosure options and allowed for filtering or searching based on HIV status. <a href="#fig3">Figure 3</a> shows the disclosure options for each platform. Growlr, Taimi, and Scruff allowed users to indicate that they were accepting of people living with HIV. Grindr, Hornet, Mr. X, Xtremboy, and Scruff, five platforms all of which are queer-specific, provide informational interventions with respect to HIV/STI prevention (See <a href="#fig4">Figure 4</a> for examples). Eight dating apps mentioned HIV in their policies (five queer-specific, three general). Four dating apps allowed users to identify with an HIV/STI-relevant identity category, often labeled &ldquo;poz&rdquo;. Please see <a href="#fig2">Figure 2</a> for a visualization of our content analysis.</p>
+
+<p>&nbsp;</p>
+<a name="fig3"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71625" alt="Disclosure options"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 3:</strong> Disclosure options.</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p>&nbsp;</p>
+<a name="fig4"></a>
+<table align="center" width="60%" cellpadding="4">
+<tr align="center"><td><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71626" alt="Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right)"></td></tr>
+<tr><td>&nbsp;</td></tr>
+<tr align="center"><td><strong>Figure 4:</strong> Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right).</td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+<p>&nbsp;</p>
+
+<p><em><strong>Policies</strong></em></p>
+
+<p>None of the 50 intimate platforms we reviewed explicitly mention HIV in their terms of service. Four platforms expressly discuss HIV in their privacy policies (Grindr, Hornet, Scruff, and Mr. X), and four platforms mention HIV in platform safety policies (Planet Romeo, Tinder, BlackPeopleMeet, and Our Time). No platform engaged any of the legal implications of HIV. No platform engaged the public health surveillance of HIV.</p>
+
+<p>Of the four platforms that expressly engage HIV in their privacy policies (Grindr, Hornet, Mr. X, Scruff), only two (Grindr &amp; Hornet) explicitly prohibit sharing HIV information with third parties. By disclosing one&rsquo;s HIV status on Mr. X and Scruff, users consent to the platform&rsquo;s processing of that information. Grindr warns that HIV status disclosure on a user profile is effectively public information, however the platform does not share HIV status information with third party tracking, analytics, and advertising companies or service providers. Of all the platforms reviewed, Grindr&rsquo;s privacy policy is the only one that devotes an entire section to HIV status, which is not particularly surprising given Grindr&rsquo;s involvement in multiple controversies around sharing HIV information with third parties (Fitzsimons, 2019; Singer, 2018):</p>
+
+<table width="70%" align="center"><tr><td>&ldquo;HIV Status. At the recommendation of HIV prevention experts and the community of Grindr users, we give you the option of publishing your health characteristics, such as your HIV status, in your Grindr community profile. Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App. As a result, you should carefully consider whether you want to disclose your HIV status. We do not share HIV status with any third-party service advertisers or third-party service providers other than companies that host data on our behalf (<em>e.g.</em>, Amazon Cloud). In addition, we do not use HIV status for advertising purposes and do not share this information with advertisers.&rdquo;</td></tr></table>
+
+<p> According to Hornet&rsquo;s privacy policies, they &ldquo;[do] not share any HIV status information with third parties unless required to do so by law&rdquo;. Of the 50 platforms reviewed, Hornet was the only one to enable users to opt into receiving &ldquo;in-app reminders to undergo HIV tests and receive information on the location of nearby testing centers.&rdquo; On Hornet, a user&rsquo;s HIV status &ldquo;is only searchable by users who have defined themselves as HIV positive.&rdquo; Scruff&rsquo;s privacy policy highlights that &ldquo;there is no requirement to&rdquo; provide them with &ldquo;health details and whether part of the POZ (HIV positive) community (for example, in creating or updating your profile),&rdquo; and that by doing so, users &ldquo;are explicitly consenting to [Scruff&rsquo;s] processing of [their] information.&rdquo; Mr. X&rsquo;s privacy policy notes that HIV status information &ldquo;may be considered &lsquo;special&rsquo; or &lsquo;sensitive&rsquo; in certain jurisdictions,&rdquo; and that by providing this information, users &ldquo;consent to [Mr. X&rsquo;s] processing that information&rdquo;.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p4"></a>Discussion</strong></p>
+
+<p><em><strong>Prevention</strong></em></p>
+
+<p>Platforms can act as an interventional tool to improve access to and perceptions of care for people living with HIV. Examples of HIV/STI prevention include a &ldquo;Last Tested Date&rdquo; section on a user&rsquo;s profile and reminders to get tested for HIV/STIs. Some current platforms engage with HIV more critically by acknowledging that HIV is an issue its users should be aware through specific features. Hornet, for instance, provides its users with HIV-relevant educational material and resources for getting tested. Hornet also limits searching based on HIV status to people who themselves have chosen the HIV positive option, thereby limiting the possibility of HIV status-based discrimination. Hornet and Grindr can also provide reminders for users to get tested. Scruff allows users to choose from sex safety practices that include using condoms, using pre-exposure prophylaxis (PrEP), and/or treatment as prevention (Warner, <em>et al.</em>, 2019).</p>
+
+<p>Due in large part to the history of HIV&rsquo;s recognition as a medical condition, HIV has been generally classified as a &ldquo;gay man&rsquo;s problem&rdquo; in North America &mdash; frequently (albeit almost as frequently unmarked) a white, cisgender gay man&rsquo;s problem. This classification and framing acted to both separate normative society from the stigma associated with the condition and provide an avenue for activism by associating it with the most &ldquo;acceptable&rdquo; queer bodies: masculine, middle class, cisgender and white (Epstein, 1996).</p>
+
+<p>HIV has disproportionately impacted gay communities specifically, but transmission does not fit a neat pattern of being binarized tidily along sexuality. It is disproportionately prevalent in communities of color, appears in heterosexual relationships and lives, and risk of transmission follows societal vulnerability and marginalization &mdash; transgender women, particularly transgender women of color, are particularly overrepresented in diagnosis rates (Clark, <em>et al.</em>, 2017). While the partial normalization of HIV &mdash; holding it outside the direct concerns of white, cisgender, heterosexual people, but embodying it in people who look &ldquo;just like them&rdquo; &mdash; may have aided in assembling efforts to address the condition, the assumptions that it has created in who is at risk and who &ldquo;counts&rdquo; have been tremendous. One only has to look at the ethnographic work of Vivianne Namaste, who highlights how Montreal&rsquo;s history of HIV, its recognition, and efforts at its prevention simultaneously elided the incidence rate amongst the Haitian community (which at one point had 65 percent of reported AIDS cases) and lacked any advice or conception of susceptibility for women, particularly heterosexual or bisexual women (Namaste, 2015).</p>
+
+<p>Our platform analysis demonstrates that these same assumptions about vulnerability and risk are present in the design of intimate platforms. Generic platforms (<em>i.e.</em>, those that cater to non-queer or broader, more heteronormative audiences) entirely do not consider, engage, or design for HIV while the platforms for queer &mdash; and more specifically gay men &mdash; do. Even within the group of 13 queer-specific applications, neither of the two queer women-specific apps allowed for HIV disclosure, even though 23 percent of people with HIV in the U.S. are women (Center for Disease Control and Prevention, 2019c). Most, if not all, platforms dedicated to general audiences do nothing when it comes to HIV prevention, contributing to the knowledge gap for general audiences on sexual health, HIV-specific, and more. Because general audiences can go through online dating experiences without encountering HIV materials, platform designers allow these users to falsely believe that their sexual lives are excluded from important matters of sexual health.</p>
+
+<p>Our intent is not to suggest that HIV should be narrated as a problem for everyone; to ignore sexuality in the impact and risk of HIV transmission is an ahistorical mistake. But treating it <em>solely</em> as a &ldquo;gay man&rsquo;s problem&rdquo; simultaneously elides differences in vulnerability and risk within gay communities and perpetuates the silence around transmission for other populations, particularly trans women of color and/or heterosexual people. In other words, it is not that HIV is not frequently a risk for gay communities, but that drawing a line between sexuality and risk perpetuates the more nuanced disparities in risk and the discourse that HIV transmission is not something anyone else has to think about.</p>
+
+<p>Platforms can implement, and have implemented, prevention efforts through Last Tested Date and Testing Reminders features. Doing so more ubiquitously, rather than solely on gay male-specific platforms, may help normalize prevention practices like getting tested regularly and knowing one&rsquo;s status. Features like these give platform designers an opportunity to promote HIV/STI prevention and care, an opportunity that is valuable precisely because it normalizes prevention. This is not to say that such features are without risks, particularly with regards to state surveillance, intervention, and structural forces, which are our next topics of concern and discussion.</p>
+
+<p><em><strong>Stigma &amp; disclosure</strong></em></p>
+
+<p>Designing for HIV is not as simple as including disclosure fields and status-based filtering or not. Allowing disclosure and filtering can protect people living with HIV from negative and sometimes harmful interactions, help filter out people who might discriminate against them, fight HIV stigma, and promote much-needed awareness. However, disclosure and filtering can also lead to discriminatory practices (Hutson, <em>et al.</em>, 2018), have potential for privacy unraveling (Warner, <em>et al.</em>, 2018), and contribute to surveillance (Fan, 2012, 2011).</p>
+
+<p>De-stigmatizing HIV offers designers an opportunity to engage with the structural dimensions of how HIV operates in social life and can possibly allow us to better tap into social norms around the condition that ultimately improve other outcomes. For instance, humanizing people living with HIV could lead to more people getting tested, being open about their status, and being communicative with their sexual partners. Platforms have the power to shift social norms and de-stigmatize HIV at scale due to their pervasiveness throughout modern connections, but designers need to contend with the ethical implications of de-stigmatizing HIV on these platforms, especially through current features such as HIV-status-based filtering and disclosure options.</p>
+
+<p>Filtering and searching tools based on HIV status can be instrumental for people living with HIV to find others who are either seropositive or otherwise accepting of seropositive people. Additionally, filtering out those who might discriminate against them for their HIV status allows people living with HIV to avoid awkward or even violent interactions with users who harbor problematic beliefs about people living with HIV. Conversely, HIV status-based filtering and searching tools have representational and allocational harms. First, they suggest that particular psycho-social characteristics accompany HIV status. These stereotypes play out in a variety of ways, such as the framing that people living with HIV engage in &ldquo;risky&rdquo; sexual behavior. Second, HIV status-based filtering can be used to structurally exclude HIV-positive users from the opportunity to engage in intimate affiliation (Hutson, <em>et al.</em>, 2018). Platforms can and do provide users the ability to screen out other users who identify as &ldquo;Poz&rdquo; or disclose their HIV status. Not only do these design features facilitate exclusion, they may disincentivize HIV-related disclosures to the extent that such disclosures can be weaponized by other users to exclude them as potential intimate affiliates.</p>
+
+<p>Disclosure fields as a way to de-stigmatize HIV are similarly complicated in that they can inhibit and benefit people living with HIV. For one, encouraging users to disclose, regardless of their status, can create a healthier culture and discussion around HIV, possibly making talking about one&rsquo;s status an acceptable and common practice of intimate engagement. On the other hand, disclosure can be used for a variety of problematic ends that harm seropositive users. Other users may discriminate against users who have disclosed their HIV status, choosing to ignore or disengage with them entirely. Disclosure may have unintended consequences and lead to more personal and violent outcomes. Due to laws in particular jurisdictions, failure to disclose one&rsquo;s status to a partner can lead to prosecution and potentially incarceration. People living with HIV might also face physical and emotional threats for disclosing their status either publicly or privately.</p>
+
+<p>Due to these complexities, designers of dating platforms must face the question of how to de-stigmatize HIV without creating additional obstacles for people living with HIV. Platforms need to critically unpack the possible consequences of well-intentioned design choices, including HIV status-based filtering and HIV status disclosure fields. Of the platforms we reviewed, Scruff is the only one to provide for HIV disclosure without using an express &ldquo;HIV status&rdquo; field, allowing instead two disclosure options, Poz and Treatment as Prevention. &ldquo;Poz&rdquo; constitutes an association and identification with a community (<em>e.g.</em>, &ldquo;I am a bear, daddy, poz&rdquo;), while &ldquo;Treatment as Prevention&rdquo; signals antiretroviral therapy (<em>i.e.</em>, use of HIV medicines to treat HIV infection) and constitutes a link to sex safety practices.</p>
+
+<p><em><strong>Surveillance &amp; criminalization</strong></em></p>
+
+<p>At the same time, given the questions of structural power and surveillance built into these platforms, we are leery of treating disclosure option design as the site of de-stigmatization and justice. Questions of privacy and stigma go wider than micro-interactions and touch on how HIV is seen and responded to societally and administratively. The dominant responses to HIV/AIDS &ldquo;center on adjusting the traditional levers of criminal and tort law, and of public health law, with its surveillance and disciplinary regimes that concentrate information and decision-making in the state&rdquo; <a name="3a"></a>[<a href="#3">3</a>]. Indeed, HIV continues to function as a &ldquo;vector for the exercise of state power and the invention of novel logics and techniques of government,&rdquo; whereby &ldquo;[i]nfection with HIV virtually guarantees that a citizen will need to interact, either beneficently or coercively, with one or more state bureaucracies&rdquo; <a name="4a"></a>[<a href="#4">4</a>].</p>
+
+<p>The broader ecosystem of intimate platforms that we observed provided virtually no HIV-specific privacy information or protections for users living with HIV. Overall, both the platforms that account for HIV in their privacy policies and the platforms that enable disclosure but do not account for HIV in their privacy policies continue to place the risks and burden of surveillance, privacy, and disclosure on users with HIV. Grindr&rsquo;s &ldquo;HIV Status&rdquo; policy puts it clearly: &ldquo;Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App.&rdquo; By surfacing this as a risk we do not mean to suggest that users lack agency &mdash; merely that the agency to choose between a range of options can be influenced by how those options are bounded and made available in addition to the affordances and norms that platform design provides. That a user makes information public does not mean that &ldquo;consumable by all&rdquo; is the framework of disclosure that they have in mind (Wittkower, 2016).</p>
+
+<p>While some intimate platforms are working towards promoting HIV disclosure, prevention, and de-stigmatization, they are also failing to grapple with privacy implications of HIV and their responsibility in ensuring it. People living with HIV are already vulnerable and bear the weight of HIV disclosure&rsquo;s downstream consequences. By continuing to offload the burdens and risk on those with HIV, platforms are likely contributing to issues of nondisclosure as well as HIV testing. Research shows that privacy fears can result in the non-disclosure of HIV status information within close personal relationships (Derlega, <em>et al.</em>, 2004; Zea, <em>et al.</em>, 2003; Derlega, <em>et al.</em>, 2002).</p>
+
+<p>In this context, proposals to design for HIV disclosure that do not consider the wider structural implications of surveillance are concerning. Most HCI research into HIV and online dating focuses on micro-interactions and on enabling trust and certainty between users, eliding the implications of handing this data over to a platform outside user control and the ways that such data can be used to control. This is not an abstract risk; just this year, Grindr (the platform under study) has been the subject of scrutiny by the U.S. government over its Chinese ownership, due to fears that the Chinese government might access and copy Grindr&rsquo;s data around HIV disclosure for the purpose of domestic policing and control (Fitzsimons, 2019). If we are designing to enable HIV disclosure, are we working to reduce the stigma associated with disclosure &mdash; or are we enabling new forms of control and surveillance?</p>
+
+<p>In the United States today, intimate platforms operate within 29 states that have HIV criminal laws, which include laws that target sex/nondisclosure of HIV-positive status, sex work, exposure to bodily fluids, needle-sharing, and blood/organ/semen donation, nine states that have sentencing enhancements applicable to people living with HIV who commit an underlying assault crime, and 24 states that have prosecuted people living with HIV under non-HIV-specific general criminal laws (Center for HIV Law &amp; Policy, 2019). Here, the design of intimate platforms cannot be removed from the reality of laws that criminalize HIV, particularly HIV non-disclosure.</p>
+
+<p>People living with HIV in U.S. states with HIV-specific criminal laws must disclose their HIV status to sexual partners. Generally, &ldquo;disclosure and consent&rdquo; is an affirmative defense <a name="5a"></a>[<a href="#5">5</a>], whereby a person can avoid criminal and civil liability if they disclose their serostatus <a name="6a"></a>[<a href="#6">6</a>] and their sexual partner voluntarily consents to sexual activity with knowledge of that serostatus <a name="7a"></a>[<a href="#7">7</a>]. Many of the laws that criminalize HIV non-disclosure do not provide guidance as to what methods of disclosure and consent are enough to avoid prosecution and conviction (McCallum, 2014). No court or legislature has affirmatively stated whether verbal disclosure and consent are necessary under criminal HIV transmission statutes. Furthermore, non-verbal communication online creates uncertainty as to whether there is sufficient disclosure and consent to remove criminal liability for HIV-positive individuals. Both disclosure and consent can be ambiguous or misunderstood, a problem that is complicated by the design and widespread use of mobile dating and hookup platforms.</p>
+
+<p>It remains unclear what constitutes appropriate disclosure and informed consent in the context of intimate platforms, such as HIV disclosure fields on user profiles or other communication in a profile&rsquo;s free-form text sections (<em>e.g.</em>, &ldquo;+&rdquo;, &ldquo;Poz&rdquo;, &ldquo;undetectable&rdquo;). Although some intimate platforms afford HIV-positive users the ability to disclose their serostatus in new ways, no court or legislature in the U.S. has answered whether disclosing HIV status on an intimate platform is enough to achieve informed consent and avoid criminal and civil liability. Yet many people living with HIV also use records of conversations on intimate platforms as a means of protection. For example, people disclose their status and use that record as a way to protect themselves from future allegations of non-disclosure. This ambiguity and its incumbent legal risk place significant responsibility and pressure on users living with HIV. Research shows that fears around rejection, self-blame, criminalization, and privacy can result in the non-disclosure of HIV status information within close personal relationships (Derlega, <em>et al.</em>, 2004; Zea, <em>et al.</em>, 2003; Derlega, <em>et al.</em>, 2002). Privacy concerns around HIV disclosure are often associated with the need to protect oneself from HIV-related stigma (Adam, <em>et al.</em>, 2011; Serovich and Mosack, 2006; Greene, <em>et al.</em>, 2003). As more and more people use platforms to meet intimate partners, the historical failure of HIV criminalization law to understand how disclosure and consent are negotiated in practice becomes all the more apparent.</p>
+
+<p>It might seem from this that designers and developers are trapped in an impossible situation &mdash; disclosure to protect users simultaneously produces the possibility of structural harms for those disclosing. While we urge designers to take both needs seriously, we do not consider it impossible; in fact, there is a range of work within queer theory and technology that not only articulates this tension of privacy, disclosure and the reuse of data but suggests queer forms of resistance to it. Writing more broadly, Brian Schram highlights the way that the increasing possibilities of &ldquo;big data&rdquo; and its attendant surveillance structures &ldquo;constitute an undoing of Queerness as a radical political injection&rdquo; <a name="8a"></a>[<a href="#8">8</a>], advocating a politics of <em>melancholia</em> that features a haunting of archives: an insertion of the dead weight of our collective memory as Queer persons into the growing catalog of our digital information. In other words, Schram suggests the deliberate incorporation of masses of false data, profiles, and traces into data stores in order to render ambiguous the truth of any presence and provide cover for those queer persons existing within the platform(s) data highlights. What would this look like in the case of dating platforms? What are the possibilities raised by incorporating a deluge of false accounts, <em>doppelg&auml;ngers</em>, and doubles, not as a deception of the platform or its users, but against state forces examining the database?</p>
+
+<p>More broadly, we might see possibilities for the future through practices in the past. Examining how queer communities responded to HIV disclosure and protection protocols during the 1980s and 1990s, David Halperin has described how gay communities worked to articulate norms that balanced risks, trust, and vulnerability in the absence of structural norms: that &ldquo;it is gay men themselves who have continued to define, and to redefine, the limits of safety through an ongoing history of sexual experimentation and mutual consultation, and who have thereby produced, over time, workable compromises and pragmatic solutions that balance safety and risk&rdquo; <a name="9a"></a>[<a href="#9">9</a>]. Rather than taking universalized, top-down approaches to platform design for all, we might instead seek to work from the bottom up, creating a diverse range of spaces that challenge the ease of surveillance built into large-scale platforms and afford individual users more agency in establishing those compromises and solutions and engaging in that consultation.</p>
+
+<p>&nbsp;</p>
+<p><img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71630" alt="++++++++++"></p>
+<p><strong><a name="p5"></a>Conclusion</strong></p>
+
+<p>As HCI researchers and designers, we continue to push the boundaries of what is technologically possible, but doing so requires us to first ask whether platform design is even an appropriate intervention in a given situation (Keyes, <em>et al.</em>, 2019; Baumer and Silberman, 2011; Suchman, 2011). The current model of platform design for HIV cannot continue, as it is too closely tied to the collection and commodification of highly sensitive personal data. However, reimagining intimate platform design provides the social computing community an opportunity to intervene in the social norms around HIV and HIV disclosure in a manner that could unburden the weight of criminalization without centralizing the surveillant arms of the state.</p>
+
+<p>We envision a future of dating platforms that does not force people living with HIV to accept surveillance as the price of intimate experiences. Because of their entanglements with sex and romance, intimate platforms need to take on more responsibility for the sexual health and data privacy of their users. Drawing from our analysis and our own lived experiences, we recommend platform-level changes, changes in platform policies, and mechanisms to prevent platforms from knowing their users&rsquo; statuses. First, platforms should make explicit to their users the consequences of storing sensitive, personal information like HIV status and their documentation processes. Next, they should also implement policies that manage how data are stored when users delete their accounts and protect these data from third-party consumers. Finally, ownership of users&rsquo; data should belong to the users themselves, rather than the platforms. Users should be able to pass along their information to other users without the platforms tracking it.</p>
+
+<p>HIV is a medical condition, but its eradication requires not just technical, or even sociotechnical, but socio<em>political</em> solutions. Indeed, the ways in which designers and policy-makers frame HIV is an inherently political decision, one that will impose the contours and boundaries of our response. The social computing community cannot do nothing, but it also must resist the desire to do everything. Designing user interfaces and platform policies to account for HIV will require a rigorous analysis of possible outcomes and consequences as well as a bedrock commitment to centering the voices and experiences of those impacted by HIV and the state&rsquo;s responses to it. Our commitments must account for the ways pathology and power intertwine to subjugate and otherize impacted communities at home and abroad.</p>
+
+<p>Designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the risk of re-entrenching the status quo and its incumbent inequalities and power relations (Dombrowski, <em>et al.</em>, 2016; Light, 2011; Irani, <em>et al.</em>, 2010; Bardzell, 2010). The social computing community must ground its efforts to design for HIV in clear political commitments to decriminalizing HIV and decentralizing power and information from the state. We must strive to unburden the weight of surveillance and incarceration on vulnerable and marginalized communities and work towards offloading the significant social and legal risks and pressures borne by people living with HIV. Moreover, our commitment to designing for HIV must not exclude or obfuscate our capacity for direct action within and outside of the realms of design and research. This means fighting for the rights, dignity, and safety of people living with HIV in the streets and in the halls of local, national, and international political, legislative, and executive bodies.</p>
+
+<p>Our instinctual response to the failed and violent efforts of HIV criminalization and surveillance should not be &ldquo;there&rsquo;s an app for that,&rdquo; but rather &ldquo;there&rsquo;s a zap for that!&rdquo;. That is, the practice of designing for people with HIV should be a &ldquo;critical technical practice&rdquo; (Agre, 1997), undertaken with a mindset that sits uneasily between and is cognizant of both individual and structural power and consequence. Pioneered by the American gay liberation movement, a zap or &ldquo;zap action&rdquo; is a political action of direct and persistent public confrontation. Whether shouting down public figures or smashing pies into the faces of evangelicals, zaps aim to disrupt and disturb persons and institutions of authority to effect change (Cohen, 2018). In the words of AIDS Coalition to Unleash Power&rsquo;s (ACT UP) &ldquo;New Member Packet&rdquo;:</p>
+
+<table width="70%" align="center"><tr><td>&ldquo;Zaps are a method for ACT UP members to register their disapproval of and anger toward the zap target. Zaps usually have more specific targets than actions. Because of this focus, numerous zapping techniques have been developed. ACT UP zaps individuals or organizations by: sending postcards or letters; invading offices and distributing fact sheets; sending (lots and lots of) faxes; picketing; outraged (and sometimes outrageous) phone calls. The more zappers who zap the zappee the better the zap.&rdquo;</td></tr></table>
+
+<p>A critical approach to designing for HIV requires the contesting of histories of incarceration, stigmatization, and surveillance and the ways in which the state exerts power and domination through its medicolegal levers of criminal law and public health surveillance. Intimate platform design should not only work to reduce the prevalence and stigma of HIV, but also to contest historic and present power imbalances and injustices between users, platforms, and the state. <img src="https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729/71628" alt="End of article"></p>
+
+<p>&nbsp;</p>
+<a name="author"></a>
+<p><strong>About the authors</strong></p>
+
+<p><strong>Calvin Liang</strong> is a Ph.D. student in the Human-Centered Design and Engineering Department at the University of Washington. His research broadly focuses on technology&rsquo;s role in and out of queerness, health, and queer health.<br>E-mail: cliang02 [at] uw [dot] edu</p>
+
+<p><strong>Jevan Alexander Hutson</strong>, living with HIV for four years, is a technology policy advocate, human-computer interaction researcher, and J.D. candidate at the University of Washington School of Law. His research interests center on issues of technology, law, and social life, with a particular focus on intimate/sexual computing.<br>E-mail: jevanh [at] uw [dot] edu</p>
+
+<p><strong>Os Keyes</strong> is a Ph.D. student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.<br>E-mail: okeyes [at] uw [dot] edu</p>
+
+<p>&nbsp;</p>
+<p><strong>Acknowledgements</strong></p>
+
+<p>We dedicate this paper to the radical history of the AIDS Coalition to Unleash Power (ACT UP) and to all of the souls we&rsquo;ve lost and continue to lose to HIV/AIDS. We would like to thank Mary Fan, Sean Munson, and Julie Kientz for valuable conversations and feedback, and Margret Wander and Margaret Hopkins for their ongoing care and support. This research was partially funded by a Microsoft Ada Lovelace Fellowship.</p>
+
+<p>&nbsp;</p>
+<p><strong>Notes</strong></p>
+
+<p><a name="1"></a><a href="#1a">1.</a> Halperin and Hoppe, 2017, p. 349.</p>
+
+<p><a name="2"></a><a href="#2a">2.</a> Jackson, <em>et al.</em>, 2014, p. 596.</p>
+
+<p><a name="3"></a><a href="#3a">3.</a> Fan, 2011, p. 36.</p>
+
+<p><a name="4"></a><a href="#4a">4.</a> Halperin and Hoppe, 2017, p. 255.</p>
+
+<p><a name="5"></a><a href="#5a">5.</a> See FLA. STAT. ANN. &sect; 775.0877 (2017) (&ldquo;[I]t is an affirmative defense to a charge of violating this section that the person exposed knew that the offender was infected with HIV, knew that the action being taken could result in transmission of the HIV infection, and consented to the action voluntarily with that knowledge.&rdquo;). See also <a href="http://www.hivlawandpolicy.org/states/florida">http://www.hivlawandpolicy.org/states/florida</a>.</p>
+
+<p><a name="6"></a><a href="#6a">6.</a> Serostatus is defined as: &ldquo;The state of either having or not having detectable antibodies against a specific antigen, as measured by a blood test (serologic test). For example, HIV seropositive means that a person has detectable antibodies to HIV; seronegative means that a person does not have detectable HIV antibodies.&rdquo; U.S. Department of Health &amp; Human Services, Education Materials, AIDSINFO, at <a href="https://aidsinfo.nih.gov/education-materials/glossary/1632/serostatus" target="_blank">https://aidsinfo.nih.gov/education-materials/glossary/1632/serostatus</a>, accessed 30 August 2019.</p>
+
+<p><a name="7"></a><a href="#7a">7.</a> Lehman, <em>et al.</em>, 2014, p. 1,101.</p>
+
+<p><a name="8"></a><a href="#8a">8.</a> Schram, 2019, p. 611.</p>
+
+<p><a name="9"></a><a href="#9a">9.</a> Halperin, 2015, p. 207.</p>
+
+<p>&nbsp;</p>
+<p><strong>References</strong></p>
+
+<p>Barry D. Adam, Richard Elliott, Patrice Corriveau, and Ken English, 2014. &ldquo;Impacts of criminalization on the everyday lives of people living with HIV in Canada,&rdquo; <em>Sexuality Research and Social Policy</em>, volume 11, number 1, pp. 39&ndash;49.<br>doi: <a href="https://doi.org/10.1007/s13178-013-0131-8" target="_blank">https://doi.org/10.1007/s13178-013-0131-8</a>, accessed 5 September 2020.</p>
+
+<p>Barry D. Adam, James Murray, Suzanne Ross, Jason Oliver, Stephen G. Lincoln, and Vicki Rynard, 2011. &ldquo;Hivstigma.com, an innovative Web-supported stigma reduction intervention for gay and bisexual men,&rdquo; <em>Health Education Research</em>, volume 26, number 5, pp. 795&ndash;807.<br>doi: <a href="https://doi.org/10.1093/her/cyq078" target="_blank">https://doi.org/10.1093/her/cyq078</a>, accessed 5 September 2020.</p>
+
+<p>Philip E. Agre, 1997. &ldquo;Toward a critical technical practice: Lessons learned in trying to reform AI,&rdquo; In: Geof Bowker, Les Gasser, Leigh Star, and Bill Turner (editors). <em>Bridging the great divide: Social science, technical systems, and cooperative work</em>. Mahwah, N.J.: Erlbaum.</p>
+
+<p>Anonymous, 2000. &ldquo;Name brands: The effects of intrusive HIV legislation on high-risk demographic groups,&rdquo; <em>Harvard Law Review</em>, volume 113, number 8, pp. 2,098&ndash;2,113.<br>doi: <a href="https://doi.org/10.2307/1342321" target="_blank">https://doi.org/10.2307/1342321</a>, accessed 5 September 2020.</p>
+
+<p>Taunya Lovell Banks, 1989. &ldquo;Women and AIDS &mdash; Racism, sexism, and classism,&rdquo; <em>New York University Review of Law &amp; Social Change</em>, volume 17, pp. 351&ndash;385, and at <a href="https://digitalcommons.law.umaryland.edu/fac_pubs/328" target="_blank">https://digitalcommons.law.umaryland.edu/fac_pubs/328</a>, accessed 5 September 2020.</p>
+
+<p>Shaowen Bardzell, 2010. &ldquo;Feminist HCI: Taking stock and outlining an agenda for design,&rdquo; <em>CHI &rsquo;10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 1,301&ndash;1,310.<br>doi: <a href="https://doi.org/10.1145/1753326.1753521" target="_blank">https://doi.org/10.1145/1753326.1753521</a>, accessed 5 September 2020.</p>
+
+<p>Fran&ccedil;oise Barr&eacute;-Sinoussi, Salim S. Abdool Karim, Jan Albert, Linda-Gail Bekker, Chris Beyrer, Pedro Cahn, Alexandra Calmy, Beatriz Grinsztejn, Andrew Grulich, Adeeba Kamarulzaman, Nagalingeswaran Kumarasamy, Mona R. Loutfy, Kamal M. El Filali, Souleymane Mboup, Julio S.G. Montaner, Paula Munderi, Vadim Pokrovsky, Anne-Mieke Vandamme, Benjamin Young, and Peter Godfrey-Faussett, 2018. &ldquo;Expert consensus statement on the science of HIV in the context of criminal law,&rdquo; <em>Journal of the International AIDS Society</em>, volume 21, number 7.<br>doi: <a href="https://doi.org/10.1002/jia2.25161" target="_blank">https://doi.org/10.1002/jia2.25161</a>, accessed 5 September 2020.</p>
+
+<p>Eric P.S. Baumer and M. Six Silberman, 2011. &ldquo;When the implication is not to design (technology),&rdquo; <em>CHI &rsquo;11: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 2,271&ndash;2,274.<br>doi: <a href="https://doi.org/10.1145/1978942.1979275" target="_blank">https://doi.org/10.1145/1978942.1979275</a>, accessed 5 September 2020.</p>
+
+<p>Allan M. Brandt, 1987. <em>No magic bullet: A social history of venereal disease in the United States since 1880</em>. Expanded edition. Oxford: Oxford University Press.</p>
+
+<p>Scott Burris and Edwin Cameron, 2008. &ldquo;The case against criminalization of HIV transmission,&rdquo; <em>Journal of the American Medical Association</em>, volume 300, number 5, pp. 578&ndash;581.<br>doi: <a href="https://doi.org/10.1001/jama.300.5.578" target="_blank">https://doi.org/10.1001/jama.300.5.578</a>, accessed 5 September 2020.</p>
+
+<p>Center for Disease Control and Prevention, 2019a. &ldquo;HIV and STD criminal laws,&rdquo; at <a href="https://www.cdc.gov/hiv/policies/law/states/exposure.html" target="_blank">https://www.cdc.gov/hiv/policies/law/states/exposure.html</a>, accessed 30 August 2019.</p>
+
+<p>Center for Disease Control and Prevention, 2019b. &ldquo;HIV surveillance reports,&rdquo; at <a href="https://www.cdc.gov/hiv/library/reports/hiv-surveillance.html" target="_blank">https://www.cdc.gov/hiv/library/reports/hiv-surveillance.html</a>, accessed 30 August 2019.</p>
+
+<p>Center for Disease Control and Prevention, 2019c. &ldquo;HIV and women,&rdquo; at <a href="https://www.cdc.gov/hiv/group/gender/women/" target="_blank">https://www.cdc.gov/hiv/group/gender/women/</a>, accessed 5 September 2020.</p>
+
+<p>Center for HIV Law &amp; Policy, 2019. &ldquo;HIV criminalization in The United States,&rdquo; at <a href="http://www.hivlawandpolicy.org/sourcebook" target="_blank">http://www.hivlawandpolicy.org/sourcebook</a>, accessed 2 February 2020.</p>
+
+<p>Hollie Clark, Aruna Surendera Babu, Ellen Weiss Wiewel, Jenevieve Opoku, and Nicole Crepaz, 2017. &ldquo;Diagnosed HIV infection in transgender adults and adolescents: Results from the National HIV Surveillance System, 2009&ndash;2014,&rdquo; <em>AIDS and Behavior</em>, volume 21, number 9, pp. 2,774&ndash;2,783.<br>doi: <a href="https://doi.org/10.1007/s10461-016-1656-7" target="_blank">https://doi.org/10.1007/s10461-016-1656-7</a>, accessed 5 September 2020.</p>
+
+<p>Sascha Cohen, 2018. &ldquo;How gay activists challenged the politics of civility,&rdquo; <em>Smithsonian Magazine</em> (10 July), at <a href="https://www.smithsonianmag.com/history/how-gay-activists-challenged-politics-civility-180969579/" target="_blank">https://www.smithsonianmag.com/history/how-gay-activists-challenged-politics-civility-180969579/</a>, accessed 5 September 2020.</p>
+
+<p>Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2004. &ldquo;Reasons for HIV disclosure/nondisclosure in close relationships: Testing a model of HIV disclosure decision making,&rdquo; <em>Journal of Social and Clinical Psychology</em>, volume 23, number 6, pp. 747&ndash;767.<br>doi: <a href="https://doi.org/10.1521/jscp.23.6.747.54804" target="_blank">https://doi.org/10.1521/jscp.23.6.747.54804</a>, accessed 5 September 2020.</p>
+
+<p>Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2002. &ldquo;Perceived HIV-related stigma and HIV disclosure to relationship partners after finding out about the seropositive diagnosis,&rdquo; <em>Journal of Health Psychology</em>, volume 7, number 4, pp. 415&ndash;432.<br>doi: <a href="https://doi.org/10.1177/1359105302007004330" target="_blank">https://doi.org/10.1177/1359105302007004330</a>, accessed 5 September 2020.</p>
+
+<p>Lynn Dombrowski, Ellie Harmon, and Sarah Fox, 2016. &ldquo;Social justice-oriented interaction design: Outlining key design strategies and commitments,&rdquo; <em>DIS &rsquo;16: Proceedings of the 2016 ACM Conference on Designing Interactive Systems</em>, pp. 656&ndash;671.<br>doi: <a href="https://doi.org/10.1145/2901790.2901861" target="_blank">https://doi.org/10.1145/2901790.2901861</a>, accessed 5 September 2020.</p>
+
+<p>Robert W. Eisinger, Carl W. Dieffenbach, and Anthony S. Fauci, 2019. &ldquo;HIV viral load and transmissibility of HIV infection: Undetectable equals untransmittable,&rdquo; <em>Journal of the American Medical Association</em>, volume 321, number 5, pp. 451&ndash;452.<br>doi: <a href="https://doi.org/10.1001/jama.2018.21167" target="_blank">https://doi.org/10.1001/jama.2018.21167</a>, accessed 5 September 2020.</p>
+
+<p>Richard Elliot, 2002. &ldquo;Criminal law, public health and HIV transmission: A policy options paper,&rdquo; <em>UNAIDS (Joint United Nations Programme on HIV/AIDS)</em>, at <a href="https://data.unaids.org/publications/irc-pub02/jc733-criminallaw_en.pdf" target="_blank">https://data.unaids.org/publications/irc-pub02/jc733-criminallaw_en.pdf</a>, accessed 5 September 2020.</p>
+
+<p>Elizabeth F. Emens, 2008. &ldquo;Intimate discrimination: The state&rsquo;s role in the accidents of sex and love,&rdquo; <em>Harvard Law Review</em>, volume 122, number 5, pp. 1,307&ndash;1,402.<br>doi: <a href="https://doi.org/10.2307/40379752" target="_blank">https://doi.org/10.2307/40379752</a>, accessed 5 September 2020.</p>
+
+<p>Steven Epstein, 1996. <em>Impure science: AIDS, activism, and the politics of knowledge</em>. Berkeley: University of California Press.</p>
+
+<p>Amy L. Fairchild, Ronald Bayer, and James Colgrove, with Daniel Wolfe, 2007. <em>Searching eyes: Privacy, the state, and disease surveillance in America</em>. Berkeley: University of California Press.</p>
+
+<p>Mary D. Fan, 2012. &ldquo;Decentralizing STD surveillance: Toward better informed sexual consent,&rdquo; <em>Yale Journal of Health Policy, Law, and Ethics</em>, volume 12, number 1, pp. 1&ndash;38.</p>
+
+<p>Mary D. Fan, 2011. &ldquo;Sex, privacy, and public health in a casual encounters culture,&rdquo; <em>University of California Davis Law Review</em>, volume 25, pp. 531&ndash;596.</p>
+
+<p>Tim Fitzsimons, 2019. &ldquo;Inside Grindr, fears that China wanted to access user data via HIV research,&rdquo; <em>NBC News</em> (2 April), at <a href="https://www.nbcnews.com/feature/nbc-out/inside-grindr-fears-china-wanted-access-user-data-hiv-research-n989996" target="_blank">https://www.nbcnews.com/feature/nbc-out/inside-grindr-fears-china-wanted-access-user-data-hiv-research-n989996</a>, accessed 5 September 2020.</p>
+
+<p>Chandra L. Ford, Kathryn D. Whetten, Susan A. Hall, Jay S. Kaufman, and Angela D. Thrasher, 2007. &ldquo;Black sexuality, social construction, and research targeting &lsquo;The Down Low&rsquo; (&lsquo;The DL&rsquo;),&rdquo; <em>Annals of Epidemiology</em>, volume 17, number 3, pp. 209&ndash;216.<br>doi: <a href="https://doi.org/10.1016/j.annepidem.2006.09.006" target="_blank">https://doi.org/10.1016/j.annepidem.2006.09.006</a>, accessed 5 September 2020.</p>
+
+<p>A.J. Fortin, 1995. &ldquo;AIDS, surveillance, and public policy,&rdquo; <em>Research in Law and Policy Studies</em>, volume 4, pp. 173&ndash;197.</p>
+
+<p>Marilou Gagnon, 2012. &ldquo;Toward a critical response to HIV criminalization: Remarks on advocacy and social justice,&rdquo; <em>Journal of the Association of Nurses in AIDS Care</em>, volume 23, number 1, pp. 11&ndash;15.<br>doi: <a href="https://doi.org/10.1016/j.jana.2011.08.012" target="_blank">https://doi.org/10.1016/j.jana.2011.08.012</a>, accessed 5 September 2020.</p>
+
+<p>Carol L. Galletly and Steven D. Pinkerton, 2006. &ldquo;Conflicting messages: How criminal HIV disclosure laws undermine public health efforts to control the spread of HIV,&rdquo; <em>AIDS and Behavior</em>, volume 10, number 5, pp. 451&ndash;461.<br>doi: <a href="https://doi.org/10.1007/s10461-006-9117-3" target="_blank">https://doi.org/10.1007/s10461-006-9117-3</a>, accessed 5 September 2020.</p>
+
+<p>C. Galletly, Z. Lazzarini, C. Sanders, and S.D. Pinkerton, 2014. &ldquo;Criminal HIV exposure laws: Moving forward,&rdquo; <em>AIDS and Behavior</em>, volume 18, number 6, pp. 1,011&ndash;1,013.<br>doi: <a href="https://doi.org/10.1007/s10461-014-0731-1" target="_blank">https://doi.org/10.1007/s10461-014-0731-1</a>, accessed 5 September 2020.</p>
+
+<p>Robert C. Gallo, 2006. &ldquo;A reflection on HIV/AIDS research after 25 years,&rdquo; <em>Retrovirology</em>, volume 3, article number 72.<br>doi: <a href="https://doi.org/10.1186/1742-4690-3-72" target="_blank">https://doi.org/10.1186/1742-4690-3-72</a>, accessed 5 September 2020.</p>
+
+<p>George Gallup, Jr. and Jim Castelli, 1987. &ldquo;Poll catalogs views on AIDS by religion,&rdquo; <em>Dallas Morning News</em> (27 September), p. 45A.</p>
+
+<p>Lawrence O. Gostin, Scott Burris, and Zita Lazzarini, 1999. &ldquo;The law and the public&rsquo;s health: A study of infectious disease law in the United States,&rdquo; <em>Columbia Law Review</em>, volume 99, number 1, pp. 59&ndash;128.</p>
+
+<p>Ben Green, 2018. &ldquo;Data science as political action: Grounding data science in a politics of justice,&rdquo; <em>arXiv</em>:1811.03435 (6 November), at <a href="https://arxiv.org/abs/1811.03435" target="_blank">https://arxiv.org/abs/1811.03435</a>, accessed 5 September 2020.</p>
+
+<p>Kathryn Greene, Valerian J. Derlega, Gust A. Yep, and Sandra Petronio, 2003. <em>Privacy and disclosure of HIV in interpersonal relationships: A sourcebook for researchers and practitioners</em>. Mahwah, N.J.: Lawrence Erlbaum Associates.</p>
+
+<p>David M. Halperin, 2015. &ldquo;The biopolitics of HIV prevention discourse,&rdquo; In: Vernon W. Cisney and Nicolae Morar (editors). <em>Biopower: Foucault and beyond</em>. Chicago: University of Chicago Press, pp. 199&ndash;227.</p>
+
+<p>David M. Halperin and Trevor Hoppe (editors), 2017. <em>The war on sex</em>. Durham, N.C.: Duke University Press.</p>
+
+<p>Mark J. Handel and Irina Shklovski, 2012. &ldquo;Disclosure, ambiguity and risk reduction in real-time dating sites,&rdquo; <em>GROUP &rsquo;12: Proceedings of the 17th ACM International Conference on Supporting Group Work</em>, pp. 175&ndash;178.<br>doi: <a href="https://doi.org/10.1145/2389176.2389203" target="_blank">https://doi.org/10.1145/2389176.2389203</a>, accessed 5 September 2020.</p>
+
+<p>Jean Hardy and Silvia Lindtner, 2017. &ldquo;Constructing a desiring user: Discourse, rurality, and design in location-based social networks,&rdquo; <em>CSCW &rsquo;17: Proceedings of the 2017 ACM Conference on Computer Supported Cooperative Work and Social Computing</em>, pp. 13&ndash;25.<br>doi: <a href="https://doi.org/10.1145/2998181.2998347" target="_blank">https://doi.org/10.1145/2998181.2998347</a>, accessed 5 September 2020.</p>
+
+<p>Dini Harsono, Carol L. Galletly, Elaine O&rsquo;Keefe, and Zita Lazzarini, 2017. &ldquo;Criminalization of HIV exposure: A review of empirical studies in the United States,&rdquo; <em>AIDS and Behavior</em>, volume 21, number 1, pp. 27&ndash;50.<br>doi: <a href="https://doi.org/10.1007/s10461-016-1540-5" target="_blank">https://doi.org/10.1007/s10461-016-1540-5</a>, accessed 5 September 2020.</p>
+
+<p>Trevor Hoppe, 2018. <em>Punishing disease: HIV and the criminalization of sickness</em>. Berkeley: University of California Press.</p>
+
+<p>Hsiu-Fang Hsieh and Sarah E. Shannon, 2005. &ldquo;Three approaches to qualitative content analysis,&rdquo; <em>Qualitative Health Research</em>, volume 15, number 9, pp. 1,277&ndash;1,288.<br>doi: <a href="https://doi.org/10.1177/1049732305276687" target="_blank">https://doi.org/10.1177/1049732305276687</a>, accessed 5 September 2020.</p>
+
+<p>Jevan A. Hutson, Jessie G. Taft, Solon Barocas, and Karen Levy, 2018. &ldquo;Debiasing desire: Addressing bias &amp; discrimination on intimate platforms,&rdquo; <em>Proceedings of the ACM on Human-Computer Interaction</em>, article number 73.<br>doi: <a href="https://doi.org/10.1145/3274342" target="_blank">https://doi.org/10.1145/3274342</a>, accessed 5 September 2020.</p>
+
+<p>Lilly Irani, Janet Vertesi, Paul Dourish, Kavita Philip, and Rebecca E. Grinter, 2010. &ldquo;Postcolonial computing: A lens on design and development,&rdquo; <em>CHI &rsquo;10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems</em>, pp. 1,311&ndash;1,320.<br>doi: <a href="https://doi.org/10.1145/1753326.1753522" target="_blank">https://doi.org/10.1145/1753326.1753522</a>, accessed 5 September 2020.</p>
+
+<p>Steven J. Jackson, Tarleton Gillespie, and Sandy Payette, 2014. &ldquo;The policy knot: Re-integrating policy, practice and design in cscw studies of social computing,&rdquo; <em>CSCW &rsquo;14: Proceedings of the 17th ACM Conference on Computer Supported Cooperative Work &amp; Social Computing</em>, pp. 588&ndash;602.<br>doi: <a href="https://doi.org/10.1145/2531602.2531674" target="_blank">https://doi.org/10.1145/2531602.2531674</a>, accessed 5 September 2020.</p>
+
+<p>Paula C. Johnson, 1992. &ldquo;Silence equals death: The response to AIDS within communities of color,&rdquo; <em>University of Illinois Law Review</em>, volume 1992, pp. 1,075&ndash;1,083.</p>
+
+<p>Ralf J&uuml;rgens, Jonathan Cohen, Edwin Cameron, Scott Burris, Michaela Clayton, Richard Elliott, Richard Pearshouse, Anne Gathumbi, and Delme Cupido, 2009. &ldquo;Ten reasons to oppose the criminalization of HIV exposure or transmission,&rdquo; <em>Reproductive Health Matters</em>, volume 17, number 34, pp. 163&ndash;172.<br>doi: <a href="https://doi.org/10.1016/S0968-8080(09)34462-6" target="_blank">https://doi.org/10.1016/S0968-8080(09)34462-6</a>, accessed 5 September 2020.</p>
+
+<p>Gopinaath Kannabiran, Shaowen Bardzell, and Jeffrey Bardzell, 2012. &ldquo;Designing (for) desire: a critical study of technosexuality in HCI,&rdquo; <em>NordiCHI &rsquo;12: Proceedings of the Seventh Nordic Conference on Human-Computer Interaction: Making Sense Through Design</em>, pp. 655&ndash;664.<br>doi: <a href="https://doi.org/10.1145/2399016.2399116" target="_blank">https://doi.org/10.1145/2399016.2399116</a>, accessed 5 September 2020.</p>
+
+<p>C&eacute;cile Kazatchkine, Edwin Bernard, and Patrick Eba, 2015. &ldquo;Ending overly broad HIV criminalization: Canadian scientists and clinicians stand for justice,&rdquo; <em>Journal of the International AIDS Society</em>, volume 18, number 1, article number 20126.<br>doi: <a href="https://doi.org/10.7448/IAS.18.1.20126" target="_blank">https://doi.org/10.7448/IAS.18.1.20126</a>, accessed 5 September 2020.</p>
+
+<p>Os Keyes, Jevan Hutson, and Meredith Durbin, 2019. &ldquo;A mulching proposal: Analysing and improving an algorithmic system for turning the elderly into high-nutrient slurry,&rdquo; <em>CHI EA &rsquo;19: Extended Abstracts of the 2019 CHI Conference on Human Factors in Computing Systems</em>, paper number alt06.<br>doi: <a href="https://doi.org/10.1145/3290607.3310433" target="_blank">https://doi.org/10.1145/3290607.3310433</a>, accessed 5 September 2020.</p>
+
+<p>Jeffrey V. Lazarus, Kelly Safreed-Harmon, Simon E. Barton, Dominique Costagliola, Nikos Dedes, Julia del Amo Valero, Jose M. Gatell, Ricardo Baptista-Leite, Lu&iacute;s Mend&atilde;o, Kholoud Porter, Stefano Vella, and J&uuml;rgen Kurt Rockstroh, 2016. &ldquo;Beyond viral suppression of HIV &mdash; The new quality of life frontier,&rdquo; <em>BMC Medicine</em>, volume 14, number 1, article number 94.<br>doi: <a href="https://doi.org/10.1186/s12916-016-0640-4" target="_blank">https://doi.org/10.1186/s12916-016-0640-4</a>, accessed 5 September 2020.</p>
+
+<p>J. Stan Lehman, Meredith H. Carr, Allison J. Nichol, Alberto Ruisanchez, David W. Knight, Anne E. Langford, Simone C. Gray, and Jonathan H. Mermin, 2014. &ldquo;Prevalence and public health implications of state laws that criminalize potential HIV exposure in the United States,&rdquo; <em>AIDS and Behavior</em>, volume 18, number 6, pp. 997&ndash;1,006.<br>doi: <a href="https://doi.org/10.1007/s10461-014-0724-0" target="_blank">https://doi.org/10.1007/s10461-014-0724-0</a>, accessed 5 September 2020.</p>
+
+<p>Karen Levy and Solon Barocas, 2018. &ldquo;Designing against discrimination in online markets,&rdquo; <em>Berkeley Technology Law Journal</em>, volume 32, number 3, pp. 1,183&ndash;1,237.<br>doi: <a href="https://doi.org/10.15779/Z38BV79V7K" target="_blank">https://doi.org/10.15779/Z38BV79V7K</a>, accessed 5 September 2020.</p>
+
+<p>Eric Lichtblau and William M. Arkin, 2014. &ldquo;More federal agencies are using undercover operations,&rdquo; <em>New York Times</em> (15 November), at <a href="https://www.nytimes.com/2014/11/16/us/more-federal-agencies-are-using-undercover-operations.html" target="_blank">https://www.nytimes.com/2014/11/16/us/more-federal-agencies-are-using-undercover-operations.html</a>, accessed 5 September 2020.</p>
+
+<p>Ann Light, 2011. &ldquo;HCI as heterodoxy: Technologies of identity and the queering of interaction with computers,&rdquo; <em>Interacting with Computers</em>, volume 23, number 5, pp. 430&ndash;438.<br>doi: <a href="https://doi.org/10.1016/j.intcom.2011.02.002" target="_blank">https://doi.org/10.1016/j.intcom.2011.02.002</a>, accessed 5 September 2020.</p>
+
+<p>Ben Light, Jean Burgess, and Stefanie Duguay, 2018. &ldquo;The walkthrough method: An approach to the study of apps,&rdquo; <em>New Media &amp; Society</em>, volume 20, number 3, pp. 881&ndash;900.<br>doi: <a href="https://doi.org/10.1177/1461444816675438" target="_blank">https://doi.org/10.1177/1461444816675438</a>, accessed 5 September 2020.</p>
+
+<p>Anish P. Mahajan, Jennifer N. Sayles, Vishal A. Patel, Robert H. Remien, Daniel Ortiz, Greg Szekeres, and Thomas J. Coates, 2008. &ldquo;Stigma in the HIV/AIDS epidemic: A review of the literature and recommendations for the way forward,&rdquo; <em>AIDS</em>, volume 22, supplement 2, pp. S67&ndash;S79.<br>doi: <a href="https://doi.org/10.1097/01.aids.0000327438.13291.62" target="_blank">https://doi.org/10.1097/01.aids.0000327438.13291.62</a>, accessed 5 September 2020.</p>
+
+<p>Alexandra McCallum, 2014. &ldquo;Criminalizing the transmission of HIV: Consent, disclosure, and online dating,&rdquo; <em>Utah Law Review</em>, volume 2014, number 3, article 5, at <a href="https://dc.law.utah.edu/ulr/vol2014/iss3/5" target="_blank">https://dc.law.utah.edu/ulr/vol2014/iss3/5</a>, accessed 5 September 2020.</p>
+
+<p>Donna Hubbard McCree and Matthew Hogben, 2010. &ldquo;The contribution to and context of other sexually transmitted diseases and tuberculosis in the HIV/AIDS epidemic among African Americans,&rdquo; In: Donna Hubbard McCree, Kenneth Jones, and Ann O&rsquo;Leary (editors). <em>African Americans and HIV/AIDS: Understanding and addressing the epidemic</em>, New York: Springer, pp. 3&ndash;12.<br>doi: <a href="https://doi.org/10.1007/978-0-387-78321-5_1" target="_blank">https://doi.org/10.1007/978-0-387-78321-5_1</a>, accessed 5 September 2020.</p>
+
+<p>William C. Miller, Carol A. Ford, Martina Morris, Mark S. Handcock, John L. Schmitz, Marcia M. Hobbs, Myron S. Cohen, Kathleen Mullan Harris, and J. Richard Udry, 2004. &ldquo;Prevalence of chlamydial and gonococcal infections among young adults in the United States,&rdquo; <em>Journal of the American Medical Association</em>, volume 291, number 18, pp. 2,229&ndash;2,236.<br>doi: <a href="https://doi.org/10.1001/jama.291.18.2229" target="_blank">https://doi.org/10.1001/jama.291.18.2229</a>, accessed 5 September 2020.</p>
+
+<p>Viviane Namaste, 2015. <em>Oversight: Critical reflections on feminist research and politics</em>. Toronto: Women&rsquo;s Press.</p>
+
+<p>Angela Perone, 2013. &ldquo;From punitive to proactive: An alternative approach for responding to HIV criminalization that departs from penalizing marginalized communities,&rdquo; <em>Hastings Women&rsquo;s Law Journal</em>, volume 24, pp. 363&ndash;406, and at <a href="https://repository.uchastings.edu/hwlj/vol24/iss2/5" target="_blank">https://repository.uchastings.edu/hwlj/vol24/iss2/5</a>, accessed 5 September 2020.</p>
+
+<p>Deana A. Pollard, 2006. &ldquo;Sex torts,&rdquo; <em>Minnesota Law Review</em>, volume 91, pp. 769&ndash;824, and at <a href="https://www.minnesotalawreview.org/wp-content/uploads/2012/01/Pollard_Final.pdf" target="_blank">https://www.minnesotalawreview.org/wp-content/uploads/2012/01/Pollard_Final.pdf</a>, accessed 5 September 2020.</p>
+
+<p>POZ, 2015. &ldquo;Man with HIV arrested for seeking sex on social media&rdquo; (22 July), at <a href="https://www.poz.com/article/stlouis-hiv-arrest-27534-4846" target="_blank">https://www.poz.com/article/stlouis-hiv-arrest-27534-4846</a>, accessed 5 September 2020.</p>
+
+<p>Russell K. Robinson, 2007. &ldquo;Structural dimensions of romantic preferences,&rdquo; <em>Fordham Law Review</em>, volume 76, pp. 2,787&ndash;2,820, and at <a href="http://fordhamlawreview.org/issues/structural-dimensions-of-romantic-preferences/" target="_blank">http://fordhamlawreview.org/issues/structural-dimensions-of-romantic-preferences/</a>, accessed 5 September 2020.</p>
+
+<p>Michael J. Rosenfeld and Reuben J. Thomas, 2012. &ldquo;Searching for a mate: The rise of the Internet as a social intermediary,&rdquo; <em>American Sociological Review</em>, volume 77, number 4, pp. 523&ndash;547.<br>doi: <a href="https://doi.org/10.1177/0003122412448050" target="_blank">https://doi.org/10.1177/0003122412448050</a>, accessed 5 September 2020.</p>
+
+<p>B.R. Simon Rosser, J. Michael Wilkerson, Derek J. Smolenski, J. Michael Oakes, Joseph Konstan, Keith J. Horvath, Gunna R. Kilian, David S. Novak, Gene P. Danilenko, and Richard Morgan, 2011. &ldquo;The future of Internet-based HIV prevention: A report on key findings from the Men&rsquo;s INTernet (MINTS-I, II) Sex Studies,&rdquo; <em>AIDS and Behavior</em>, volume 15, supplement 1, pp. S91&ndash;S100.<br>doi: <a href="https://doi.org/10.1007/s10461-011-9910-5" target="_blank">https://doi.org/10.1007/s10461-011-9910-5</a>, accessed 5 September 2020.</p>
+
+<p>Brian Schram, 2019. &ldquo;Accidental orientations: Rethinking queerness in archival times,&rdquo; <em>Surveillance &amp; Society</em>, volume 17, number 5, pp. 602&ndash;617.<br>doi: <a href="https://doi.org/10.24908/ss.v17i5.8688" target="_blank">https://doi.org/10.24908/ss.v17i5.8688</a>, accessed 5 September 2020.</p>
+
+<p>Junichi P. Semitsu, 2011. &ldquo;From Facebook to mug shot: How the dearth of social networking privacy rights revolutionized online government surveillance,&rdquo; <em>Pace Law Review</em>, volume 31, number 1, pp. 291&ndash;381, and at <a href="https://digitalcommons.pace.edu/plr/vol31/iss1/7" target="_blank">https://digitalcommons.pace.edu/plr/vol31/iss1/7</a>, accessed 5 September 2020.</p>
+
+<p>Sero Project, 2012. &ldquo;National criminalization survey preliminary results,&rdquo; (25 July), at <a href="https://toolkit.hivjusticeworldwide.org/resource/the-sero-project-national-criminalization-survey-preliminary-results-2/" target="_blank">https://toolkit.hivjusticeworldwide.org/resource/the-sero-project-national-criminalization-survey-preliminary-results-2/</a>, accessed 30 August 2019.</p>
+
+<p>Julianne M. Serovich and Katie E. Mosack, 2003. &ldquo;Reasons for HIV disclosure or nondisclosure to casual sexual partners,&rdquo; <em>AIDS Education and Prevention</em>, volume 15, number 1, pp. 70&ndash;80.</p>
+
+<p>Natasha Singer, 2018. &ldquo;Grindr sets off privacy firestorm after sharing users&rsquo; H.I.V.-status data,&rdquo; <em>New York Times</em> (3 April), at <a href="https://www.nytimes.com/2018/04/03/technology/grindr-sets-off-privacy-firestorm-after-sharing-users-hiv-status-data.html" target="_blank">https://www.nytimes.com/2018/04/03/technology/grindr-sets-off-privacy-firestorm-after-sharing-users-hiv-status-data.html</a>, accessed 5 September 2020.</p>
+
+<p>Lucy Suchman, 2011. &ldquo;Anthropological relocations and the limits of design,&rdquo; <em>Annual Review of Anthropology</em>, volume 40, pp. 1&ndash;18.<br>doi: <a href="https://doi.org/10.1146/annurev.anthro.041608.105640" target="_blank">https://doi.org/10.1146/annurev.anthro.041608.105640</a>, accessed 5 September 2020.</p>
+
+<p>Cass R. Sunstein, 1996. &ldquo;Social norms and social roles,&rdquo; <em>Columbia Law Review</em>, volume 96, number 4, pp. 903&ndash;968.</p>
+
+<p>Patricia Sweeney, Simone C. Gray, David W. Purcell, Jenny Sewell, Aruna Surendera Babu, Brett A. Tarver, Joseph Prejean, and Jonathan Mermin, 2017. &ldquo;Association of HIV diagnosis rates and laws criminalizing HIV exposure in the United States,&rdquo; <em>AIDS</em>, volume 31, number 10, pp. 1,483&ndash;1,488.<br>doi: <a href="https://doi.org/10.1097/QAD.0000000000001501" target="_blank">https://doi.org/10.1097/QAD.0000000000001501</a>, accessed 5 September 2020.</p>
+
+<p>Bryan L. Sykes, Trevor A. Hoppe, and Kristen D. Maziarka, 2016. &ldquo;Cruel intentions? HIV prevalence and criminalization during an age of mass incarceration, U.S. 1999 to 2012,&rdquo; <em>Medicine (Baltimore)</em>, volume 95, number 16, e3352.<br>doi: <a href="https://doi.org/10.1097/MD.0000000000003352" target="_blank">https://doi.org/10.1097/MD.0000000000003352</a>, accessed 5 September 2020.</p>
+
+<p>Samuel Hardman Taylor, Jevan Alexander Hutson, and Tyler Richard Alicea, 2017. &ldquo;Social consequences of Grindr use: Extending the Internet-enhanced self-disclosure hypothesis,&rdquo; <em>CHI &rsquo;17: Proceedings of the 2017 CHI Conference on Human Factors in Computing Systems</em>, pp. 6,645&ndash;6,657.<br>doi: <a href="https://doi.org/10.1145/3025453.3025775" target="_blank">https://doi.org/10.1145/3025453.3025775</a>, accessed 5 September 2020.</p>
+
+<p>Steven Thrasher, 2015. &ldquo;A Black body on trial: The conviction of HIV-positive &lsquo;Tiger Mandingo&rsquo;,&rdquo; <em>BuzzFeed News</em> (30 November), at <a href="https://www.buzzfeednews.com/article/steventhrasher/a-black-body-on-trial-the-conviction-of-hiv-positive-tiger-m" target="_blank">https://www.buzzfeednews.com/article/steventhrasher/a-black-body-on-trial-the-conviction-of-hiv-positive-tiger-m</a>, accessed 5 September 2020.</p>
+
+<p>Liming Wang, Dylan Podson, Zihuang Chen, Hongyan Lu, Vania Wang, Colin Shepard, John K. Williams, and Guodong Mi, 2019. &ldquo;Using social media to increase HIV testing among men who have sex with men &mdash; Beijing, China, 2013&ndash;2017,&rdquo; <em>Morbidity and Mortality Weekly Report</em>, volume 68, number 21, pp. 478&ndash;482.<br>doi: <a href="http://dx.doi.org/10.15585/mmwr.mm6821a3" target="_blank">http://dx.doi.org/10.15585/mmwr.mm6821a3</a>, accessed 5 September 2020.</p>
+
+<p>Helen Ward, 2005. &ldquo;Partner notification and contact-tracing,&rdquo; <em>Medicine</em>, volume 33, number 9, pp. 28&ndash;30.<br>doi: <a href="https://doi.org/10.1383/medc.2005.33.9.28" target="_blank">https://doi.org/10.1383/medc.2005.33.9.28</a>, accessed 5 September 2020.</p>
+
+<p>Helen Ward and Gill Bell, 2014. &ldquo;Partner notification,&rdquo; <em>Medicine (Abingdon)</em>, volume 42, number 6, pp. 314&ndash;317.<br>doi: <a href="https://doi.org/10.1016/j.mpmed.2014.03.013" target="_blank">https://doi.org/10.1016/j.mpmed.2014.03.013</a>, accessed 5 September 2020.</p>
+
+<p>Mark Warner, Andreas Gutmann, M. Angela Sasse, and Ann Blandford, 2018. &ldquo;Privacy unraveling around explicit HIV status disclosure fields in the online geosocial hookup app Grindr,&rdquo; <em>Proceedings of the ACM on Human-Computer Interaction</em>, article number 181.<br>doi: <a href="https://doi.org/10.1145/3274450" target="_blank">https://doi.org/10.1145/3274450</a>, accessed 5 September 2020.</p>
+
+<p>Mark Warner, Juan F. Maestre, Jo Gibbs, Chia-Fang Chung, and Ann Blandford, 2019. &ldquo;Signal appropriation of explicit HIV status disclosure fields in sex-social apps used by gay and bisexual men,&rdquo; <em>CHI &rsquo;19: Proceedings of the 2019 CHI Conference on Human Factors in Computing Systems</em>, paper number 692.<br>doi: <a href="https://doi.org/10.1145/3290605.3300922" target="_blank">https://doi.org/10.1145/3290605.3300922</a>, accessed 5 September 2020.</p>
+
+<p>Dylan Eric Wittkower, 2016. &ldquo;Lurkers, creepers, and virtuous interactivity: From property rights to consent to care as a conceptual basis for privacy concerns and information ethics,&rdquo; <em>First Monday</em>, volume 21, number 10, at <a href="https://firstmonday.org/article/view/6948/5628" target="_blank">https://firstmonday.org/article/view/6948/5628</a>, accessed 5 September 2020.<br>doi: <a href="https://doi.org/10.5210/fm.v21i10.6948" target="_blank">https://doi.org/10.5210/fm.v21i10.6948</a>, accessed 5 September 2020.</p>
+
+<p>Dan Wohlfeiler, Jennifer Hecht, Jonathan Volk, H. Fisher Raymond, Tom Kennedy, and Willi McFarland, 2013. &ldquo;How can we improve online HIV and STD prevention for men who have sex with men? Perspectives of hook-up website owners, website users, and HIV/STD directors,&rdquo; <em>AIDS and Behavior</em>, volume 17, number 9, pp. 3,024&ndash;3,033.<br>doi: <a href="https://doi.org/10.1007/s10461-012-0375-y" target="_blank">https://doi.org/10.1007/s10461-012-0375-y</a>, accessed 5 September 2020.</p>
+
+<p>Mar&iacute;a Cecilia Zea, Carol A. Reisen, Paul J. Poppen, and Rafael M. D&iacute;az, 2003. &ldquo;Asking and telling: Communication about HIV status among Latino HIV-positive gay men,&rdquo; <em>AIDS and Behavior</em>, volume 7, number 2, pp. 143&ndash;152.<br>doi: <a href="https://doi.org/10.1023/A:1023994207984" target="_blank">https://doi.org/10.1023/A:1023994207984</a>, accessed 5 September 2020.</p>
+
+<p>Shoshana Zuboff, 2019. <em>The age of surveillance capitalism: The fight for a human future at the new frontier of power</em>. London: Profile Books.</p>
+
+<p>&nbsp;</p>
+<hr width="300">
+
+<p><strong>Editorial history</strong></p>
+<p>Received 17 October 2019; revised 12 February 2020; accepted 28 August 2020.</p>
+
+<hr>
+
+<p><a href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" src="https://i.creativecommons.org/l/by/4.0/80x15.png"></a><br>This paper is licensed under a <a href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.</p>
+
+<p>Surveillance, stigma &amp; sociotechnical design for HIV<br>by Calvin Liang, Jevan Alexander Hutson, and Os Keyes.<br><em>First Monday</em>, Volume 25, Number 10 - 5 October 2020<br>https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729<br>doi: <a href="http://dx.doi.org/10.5210/fm.v25i10.10274" target="_blank">http://dx.doi.org/10.5210/fm.v25i10.10274</a></p>
+</blockquote>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/first_monday_ojs3_landingpage.html b/python/tests/files/first_monday_ojs3_landingpage.html
new file mode 100644
index 0000000..2633256
--- /dev/null
+++ b/python/tests/files/first_monday_ojs3_landingpage.html
@@ -0,0 +1,616 @@
+ <!DOCTYPE html>
+<html lang="en-US" xml:lang="en-US">
+<head>
+ <meta charset="utf-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ | First Monday
+ </title>
+
+
+<meta name="generator" content="Open Journal Systems 3.1.2.0">
+<link rel="icon" href="https://firstmonday.org/ojs/public/journals/3/favicon_en_US.gif">
+<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
+<meta name="DC.Coverage" xml:lang="en" content=""/>
+<meta name="DC.Creator.PersonalName" content="Calvin Liang"/>
+<meta name="DC.Creator.PersonalName" content="Jevan Alexander Hutson"/>
+<meta name="DC.Creator.PersonalName" content="Os Keyes"/>
+<meta name="DC.Date.created" scheme="ISO8601" content="2020-09-10"/>
+<meta name="DC.Date.dateSubmitted" scheme="ISO8601" content="2019-09-15"/>
+<meta name="DC.Date.issued" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Date.modified" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Description" xml:lang="en" content="Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work†for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."/>
+<meta name="DC.Format" scheme="IMT" content="text/html"/>
+<meta name="DC.Identifier" content="10274"/>
+<meta name="DC.Identifier.DOI" content="10.5210/fm.v25i10.10274"/>
+<meta name="DC.Identifier.URI" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="DC.Language" scheme="ISO639-1" content="en"/>
+<meta name="DC.Rights" content="Copyright (c) 2020 First Monday"/>
+<meta name="DC.Rights" content=""/>
+<meta name="DC.Source" content="First Monday"/>
+<meta name="DC.Source.ISSN" content="1396-0466"/>
+<meta name="DC.Source.URI" content="https://firstmonday.org/ojs/index.php/fm"/>
+<meta name="DC.Subject" xml:lang="en" content="HIV"/>
+<meta name="DC.Subject" xml:lang="en" content="online dating"/>
+<meta name="DC.Subject" xml:lang="en" content="design"/>
+<meta name="DC.Subject" xml:lang="en" content="policy"/>
+<meta name="DC.Subject" xml:lang="en" content="surveillance"/>
+<meta name="DC.Subject" xml:lang="en" content="intimacy"/>
+<meta name="DC.Subject" xml:lang="en" content="social computing"/>
+<meta name="DC.Subject" xml:lang="en" content="social justice"/>
+<meta name="DC.Title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="DC.Type" content="Text.Serial.Journal"/>
+<meta name="DC.Type" xml:lang="en" content="Qualitative; Content analysis"/>
+<meta name="DC.Type.articleType" content="Articles"/>
+<meta name="gs_meta_revision" content="1.1"/>
+<meta name="citation_journal_title" content="First Monday"/>
+<meta name="citation_journal_abbrev" content="1"/>
+<meta name="citation_issn" content="1396-0466"/>
+<meta name="citation_author" content="Calvin Liang"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_author" content="Jevan Alexander Hutson"/>
+<meta name="citation_author_institution" content="University of Washington, School of Law"/>
+<meta name="citation_author" content="Os Keyes"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="citation_date" content="2020/09/10"/>
+<meta name="citation_doi" content="10.5210/fm.v25i10.10274"/>
+<meta name="citation_abstract_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="citation_language" content="en"/>
+<meta name="citation_keywords" xml:lang="en" content="HIV"/>
+<meta name="citation_keywords" xml:lang="en" content="online dating"/>
+<meta name="citation_keywords" xml:lang="en" content="design"/>
+<meta name="citation_keywords" xml:lang="en" content="policy"/>
+<meta name="citation_keywords" xml:lang="en" content="surveillance"/>
+<meta name="citation_keywords" xml:lang="en" content="intimacy"/>
+<meta name="citation_keywords" xml:lang="en" content="social computing"/>
+<meta name="citation_keywords" xml:lang="en" content="social justice"/>
+<meta name="citation_fulltext_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"/>
+<link rel="alternate" type="application/atom+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+<link rel="alternate" type="application/rdf+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+<link rel="alternate" type="application/rss+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <link rel="stylesheet" href="https://firstmonday.org/ojs/index.php/fm/$$$call$$$/page/page/css?name=stylesheet" type="text/css" /><link rel="stylesheet" href="//fonts.googleapis.com/css?family=Noto+Sans:400,400italic,700,700italic" type="text/css" /><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.css" type="text/css" /><link rel="stylesheet" href="https://firstmonday.org/ojs/public/journals/3/styleSheet.css" type="text/css" />
+</head>
+<body class="pkp_page_article pkp_op_view has_site_logo" dir="ltr">
+
+ <div class="cmp_skip_to_content">
+ <a href="#pkp_content_main">Skip to main content</a>
+ <a href="#pkp_content_nav">Skip to main navigation menu</a>
+ <a href="#pkp_content_footer">Skip to site footer</a>
+ </div>
+ <div class="pkp_structure_page">
+
+ <header class="pkp_structure_head" id="headerNavigationContainer" role="banner">
+ <div class="pkp_head_wrapper">
+
+ <div class="pkp_site_name_wrapper">
+ <div class="pkp_site_name">
+ <a href=" https://firstmonday.org/ojs/index.php/fm/index
+ " class="is_img">
+ <img src="https://firstmonday.org/ojs/public/journals/3/pageHeaderLogoImage_en_US.gif" width="252" height="102" alt="Page Header Logo" />
+ </a>
+ </div>
+ </div>
+
+
+ <nav class="pkp_navigation_primary_row" aria-label="Site Navigation">
+ <div class="pkp_navigation_primary_wrapper">
+ <ul id="navigationPrimary" class="pkp_navigation_primary pkp_nav_list">
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About
+ </a>
+ <ul>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About the Journal
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/editorialTeam">
+ Editorial Team
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/privacy">
+ Privacy Statement
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/contact">
+ Contact
+ </a>
+ </li>
+ </ul>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search">
+ Search
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/current">
+ Current
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/announcement">
+ Announcements
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/submissions">
+ Submissions
+ </a>
+ </li>
+ </ul>
+
+
+
+ <form class="pkp_search" action="https://firstmonday.org/ojs/index.php/fm/search/search" method="post" role="search">
+ <input type="hidden" name="csrfToken" value="671acac3a608346eb0eb4de1f26c7563">
+ <input name="query" value="" type="text" aria-label="Search Query">
+ <button type="submit">
+ Search
+ </button>
+ <div class="search_controls" aria-hidden="true">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search" class="headerSearchPrompt search_prompt" aria-hidden="true">
+ Search
+ </a>
+ <a href="#" class="search_cancel headerSearchCancel" aria-hidden="true"></a>
+ <span class="search_loading" aria-hidden="true"></span>
+ </div>
+</form>
+ </div>
+ </nav>
+
+ <nav class="pkp_navigation_user_wrapper" id="navigationUserWrapper" aria-label="User Navigation">
+ <ul id="navigationUser" class="pkp_navigation_user pkp_nav_list">
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/user/register">
+ Register
+ </a>
+ </li>
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/login">
+ Login
+ </a>
+ </li>
+ </ul>
+
+ </nav>
+ </div><!-- .pkp_head_wrapper -->
+ </header><!-- .pkp_structure_head -->
+
+ <div class="pkp_structure_content has_sidebar">
+ <div id="pkp_content_main" class="pkp_structure_main" role="main">
+
+<div class="page page_article">
+ <nav class="cmp_breadcrumbs" role="navigation" aria-label="You are here:">
+ <ol>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/index">
+ Home
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li class="current">
+ Articles
+ </li>
+ </ol>
+</nav>
+
+ <article class="obj_article_details">
+ <h1 class="page_title">
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ </h1>
+
+
+ <div class="row">
+ <div class="main_entry">
+
+ <ul class="item authors">
+ <li>
+ <span class="name">
+ Calvin Liang
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0002-3795-3441" target="_blank">
+ https://orcid.org/0000-0002-3795-3441
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Jevan Alexander Hutson
+ </span>
+ <span class="affiliation">
+ University of Washington, School of Law
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0003-3312-1733" target="_blank">
+ https://orcid.org/0000-0003-3312-1733
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Os Keyes
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0001-5196-609X" target="_blank">
+ https://orcid.org/0000-0001-5196-609X
+ </a>
+ </span>
+ </li>
+ </ul>
+
+ <div class="item doi">
+ <span class="label">
+ DOI:
+ </span>
+ <span class="value">
+ <a href="https://doi.org/10.5210/fm.v25i10.10274">
+ https://doi.org/10.5210/fm.v25i10.10274
+ </a>
+ </span>
+ </div>
+
+ <div class="item keywords">
+ <span class="label">
+ Keywords:
+ </span>
+ <span class="value">
+ HIV, online dating, design, policy, surveillance, intimacy, social computing, social justice </span>
+ </div>
+
+ <div class="item abstract">
+ <h3 class="label">Abstract</h3>
+        <p>Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.</p>
+ </div>
+
+
+
+ <div class="item author_bios">
+ <h3 class="label">
+ Author Biographies
+ </h3>
+ <div class="sub_item">
+ <div class="label">
+ Calvin Liang, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+						<p>Calvin Liang is a PhD student in Human-Centered Design and Engineering at The University of Washington. Their research broadly focuses on technology’s role in and out of queerness, health, and queer health.</p>
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Jevan Alexander Hutson, <span class="affiliation">University of Washington, School of Law</span>
+ </div>
+ <div class="value">
+						Jevan Hutson is a third-year law student and Gregoire Fellow at the University of Washington School of Law. He holds an M.P.S. from the Department of Information Science at Cornell University, and a B.A. from the Department of Art History and Visual Studies at Cornell University. He has been published in venues including the Association for Computing Machinery’s conferences on Computer Human Interaction and Computer Supported Cooperative Work and Social Computing.
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Os Keyes, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+ Os Keyes is a PhD student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.
+ </div>
+ </div>
+ </div>
+
+
+ </div><!-- .main_entry -->
+
+ <div class="entry_details">
+
+ <div class="item cover_image">
+ <div class="sub_item">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+					<img src="https://firstmonday.org/ojs/public/journals/3/cover_issue_678_en_US.png" alt="“Frank Moore, Digital Divide, 2001 gouache, oil and mixed media on paper 14 3/4 x 24 1/4 inches (36,4 x 61,6 cm) sheet”">
+ </a>
+ </div>
+ </div>
+
+ <div class="item galleys">
+ <ul class="value galleys_links">
+ <li>
+
+
+
+
+<a class="obj_galley_link file" href="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729">
+
+
+ HTML
+
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="item published">
+ <div class="label">
+ Published
+ </div>
+ <div class="value">
+ 2020-09-10
+ </div>
+ </div>
+
+ <div class="item citation">
+ <div class="sub_item citation_display">
+ <div class="label">
+ How to Cite
+ </div>
+ <div class="value">
+ <div id="citationOutput" role="region" aria-live="polite">
+ <div class="csl-bib-body">
+ <div class="csl-entry">Liang, C., Hutson, J. A., &#38; Keyes, O. (2020). Surveillance, stigma &amp; sociotechnical design for HIV. <i>First Monday</i>, <i>25</i>(10). https://doi.org/10.5210/fm.v25i10.10274</div>
+</div>
+ </div>
+ <div class="citation_formats">
+ <button class="cmp_button citation_formats_button" aria-controls="cslCitationFormats" aria-expanded="false" data-csl-dropdown="true">
+ More Citation Formats
+ </button>
+ <div id="cslCitationFormats" class="citation_formats_list" aria-hidden="true">
+ <ul class="citation_formats_styles">
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274&amp;return=json"
+ >
+ ACM
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274&amp;return=json"
+ >
+ ACS
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274&amp;return=json"
+ >
+ APA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274&amp;return=json"
+ >
+ ABNT
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274&amp;return=json"
+ >
+ Chicago
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274&amp;return=json"
+ >
+ Harvard
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274&amp;return=json"
+ >
+ IEEE
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274&amp;return=json"
+ >
+ MLA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274&amp;return=json"
+ >
+ Turabian
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274&amp;return=json"
+ >
+ Vancouver
+ </a>
+ </li>
+ </ul>
+ <div class="label">
+ Download Citation
+ </div>
+ <ul class="citation_formats_styles">
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/ris?submissionId=10274">
+ <span class="fa fa-download"></span>
+ Endnote/Zotero/Mendeley (RIS)
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/bibtex?submissionId=10274">
+ <span class="fa fa-download"></span>
+ BibTeX
+ </a>
+ </li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="item issue">
+ <div class="sub_item">
+ <div class="label">
+ Issue
+ </div>
+ <div class="value">
+ <a class="title" href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ </div>
+ </div>
+
+ <div class="sub_item">
+ <div class="label">
+ Section
+ </div>
+ <div class="value">
+ Articles
+ </div>
+ </div>
+ </div>
+
+
+ <div class="item copyright">
+ <p>Authors retain copyright to their work published in <em>First Monday</em>. Please see the footer of each article for details.</p>
+ </div>
+
+
+
+ </div><!-- .entry_details -->
+ </div><!-- .row -->
+
+</article>
+
+
+
+</div><!-- .page -->
+
+ </div><!-- pkp_structure_main -->
+
+ <div class="pkp_structure_sidebar left" role="complementary" aria-label="Sidebar">
+ <div class="pkp_block block_developed_by">
+ <div class="content">
+ <a href="http://pkp.sfu.ca/ojs/">
+ Open Journal Systems
+ </a>
+ </div>
+</div>
+<div class="pkp_block block_web_feed">
+ <span class="title">Current Issue</span>
+ <div class="content">
+ <ul>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/atom.svg" alt="Atom logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss20_logo.svg" alt="RSS2 logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss10_logo.svg" alt="RSS1 logo">
+ </a>
+ </li>
+ </ul>
+ </div>
+</div>
+
+ </div><!-- pkp_sidebar.left -->
+ </div><!-- pkp_structure_content -->
+
+<div id="pkp_content_footer" class="pkp_structure_footer_wrapper" role="contentinfo">
+
+ <div class="pkp_structure_footer">
+
+ <div class="pkp_footer_content">
+ <p>A Great Cities Initiative of the University of Illinois at Chicago&nbsp;<a href="http://library.uic.edu/">University Library</a>.</p>
+<p>©&nbsp;<em>First Monday</em>, 1995-2020. ISSN&nbsp;1396-0466.</p>
+ </div>
+
+ <div class="pkp_brand_footer" role="complementary">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/aboutThisPublishingSystem">
+ <img alt="About this Publishing System" src="https://firstmonday.org/ojs/templates/images/ojs_brand.png">
+ </a>
+ </div>
+ </div>
+</div><!-- pkp_structure_footer_wrapper -->
+
+</div><!-- pkp_structure_page -->
+
+<script src="//ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js" type="text/javascript"></script><script src="//ajax.googleapis.com/ajax/libs/jqueryui/1.12.0/jquery-ui.min.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/lib/pkp/js/lib/jquery/plugins/jquery.tag-it.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/popper/popper.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/util.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/dropdown.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/main.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/generic/citationStyleLanguage/js/articleCitation.js" type="text/javascript"></script><script type="text/javascript">
+(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
+
+ga('create', 'UA-41314203-1', 'auto');
+ga('send', 'pageview');
+</script>
+
+
+</body>
+</html>
diff --git a/python/tests/files/genders_g58_fairlie.html b/python/tests/files/genders_g58_fairlie.html
new file mode 100644
index 0000000..49cada8
--- /dev/null
+++ b/python/tests/files/genders_g58_fairlie.html
@@ -0,0 +1,146 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+<title>Genders OnLine Journal - Genders OnLine Journal - Presenting innovative theories in art, literature, history, music, TV and film.</title>
+<meta name="description" content="Analysis of Hitchcock’s Rope (1948) as a critique of heteromasculinity that thematizes queer anguish, orality, and women’s relationship to the covert world of homosexual knowledge.">
+<meta name="keywords" content="homosexuality, homophobia, Cold War, the closet, heteromasculinity, queer anguish, anus, suspicion, orality, eating, cannibalism, Catholicism, knowledge, the cinematic cut, cinematic reality, women in Hitchcock, women and gay men, lack, hypocrisy, straight male interlocutor.">
+<style type="text/css">
+<!--
+
+td {
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 13px;
+}
+
+.Section1 {
+ page:Section1;
+}
+-->
+</style>
+</head>
+<body alink="#000088" background="../image/back.jpg" vlink="#00aa00">
+<p>
+<table width="600">
+ <tbody>
+ <tr>
+ <td valign="top" width="90"><p><img src="../image/indlgo.gif" alt="Genders OnLine Journal" align="bottom" border="0" height="530" width="97"> </p></td>
+ <td align="right" valign="top" width="530"><table width="530">
+ <tbody>
+ <tr>
+ <td valign="top"><p><b><font size="2">Issue 58</font></b>, Fall 2013</p>
+ <p><font size="5"><strong>Reading Maeshowe</strong></font> <br>
+ Recovering the Feminine in a Neolithic Tomb</p>
+<p>By <strong>CHARLOTTE FAIRLIE</strong></p>
+ <p>[1] Cuween, a small Neolithic cairn, perches on top of a hill on the Orkney Mainland. A flashlight waits in a bucket by the door, and visitors crawl on hands and knees, one by one, into the pitch-black interior. After savoring a degree of darkness rare in modern life, they direct beams of light up the tapering walls to marvel at the skill of the stonemasons. It is impossible to resist the impulse to clamber into the chambers and crouch where the bones once lay. Green and smooth, Maeshowe, another Orkney cairn, rises enigmatically from the field where it has stood since around 2700 BC. The designation of this monument and the surrounding Neolithic structures as a UNESCO World Heritage Site (WHS) in 1999 significantly increased tourism to the area (Card et al. 429), so while visitors may still enter Cuween unsupervised, access to the much larger Maeshowe now requires a timed ticket, bought in advance. Throughout the year, thousands of visitors, bending uncomfortably low, shuffle through the tunnel-like passage entry, making the physical journey from light to dark and a more psychological journey from present to past. Exploring any of the Neolithic sites in Orkney is to bridge time, to feel kinship with those who built them.</p>
+ <p>[2] Without doubt, a major reason Maeshowe attracts so many people is its symbiotic relationship with its environment. Most famously, at sundown during the December solstice, the winter sun lines up with the door of the tomb, shines down the passage, and focuses its rays on the stone wall within. Interest in this phenomenon, the moment when the light stabs the darkness, is so high that Historic Scotland provides web-cam coverage, but Maeshowe fascinates others besides tourists and solstice celebrants. Whether they are vacation visitors, archaeologists, anthropologists, or poets, explorers experience the sites differently, applying their own intellectual tools and imagining Neolithic lives from their respective points of view. Leslie Riddoch has written that these are &ldquo;Stone Age marvels which inspire and astonish,&rdquo; and Simon W. Hall expresses the experiences of many when he refers to &ldquo;the profound impact of entering a tomb&rdquo; (160). They imply that to enter a cairn is to become one with it, to undergo a transformation. Maeshowe, which can now be experienced only under the regimented conditions required by the Historic Scotland guides, clearly retains extraordinary power to inspire. Indeed, this ancient mound has attracted a great deal of literary attention from both noted and obscure writers. Considering these cumulative interpretations, rather than relying solely on the work of archaeologists, opens up a more comprehensive, textured, and, indeed, gendered understanding of ancient history and our commonality with Neolithic peoples.</p>
+ <p> [3] George Mackay Brown, Kathleen Jamie, Myra Schneider, and Dilys Rose are four of the more prominent authors for whom Maeshowe has proven inspirational. They have experienced the tomb through a doubly imaginative process: first by reading it as they would read a poem and then by expressing that interpretation in writing. While Brown was an Orcadian, living most of his life alongside the Neolithic sites, Jamie, Schneider, and Rose, all of whom have Scottish roots, experience Maeshowe as tourists, drawn across the Pentland Firth to enter the passage and travel into the darkness. Significantly, all three of these more contemporary writers are women. Hall, in his valuable survey, <u>The History of Orkney Literature</u>, contrasts the use of the prehistoric by female Scottish writers with that of their male counterparts, stating that it is less political, that women authors take &ldquo;the opportunity to reestablish the place&mdash;and, significantly, the inner lives of women in the prehistoric or early historical northern landscape&rdquo; (162-163). I would argue, however, that their work also engages the public world to a greater extent and is more ideological than this statement implies. Jamie&rsquo;s, Schneider&rsquo;s, and Rose&rsquo;s experiences in Maeshowe lead to readings of the monument that build on the archaeological interpretations, allowing us to consider the possibility of ancient gender power struggles and raising our awareness of the deep roots of masculine dominance.</p>
+ <p>[4] Archaeologist Colin Richards, who has written extensively about The Heart of Neolithic Orkney WHS, describes how visiting cairns must also have affected prehistoric visitors: &ldquo;the journey will be one of consequence.&rdquo; Moving from the light of day to the dark mysteries of a tomb&rsquo;s interior &ldquo;is a passage from the profane to the sacred.&rdquo; As such, &ldquo;it will involve transformation&rdquo; (&ldquo;Doorways&rdquo; 70-71). However, the nature of the transformation is mysterious. Referring to single-chambered structures divided into stalls, he continues, &ldquo;If the Orkney-Cromarty &lsquo;chambered&rsquo; tombs are principally conceived as a series of doorways, the question arises: where are they leading? To what goal?&rdquo; (71). In discussing the relationship between buildings and the people who used them thousands of years ago, Richards considers the figurative significance of doors. In doing so, he treats the tombs as if they were literary texts with debatable meaning, having previously pointed out that &ldquo;the architecture of a chambered tomb relied on analogy and metaphor for its understanding and interpretation&rdquo; (&ldquo;Doorways&rdquo; 67). Rather than merely being repositories for bones, the tombs, Richards asserts, were &ldquo;built to be experienced visually, physically and imaginatively,&rdquo; an experience which may well result in some kind of &ldquo;revelation&rdquo; (&ldquo;Doorways.&rdquo; 69, 70, 76). Since he argues that buildings carry metaphoric meaning, open to imaginative interpretation, it is entirely appropriate that, when explaining this, Richards also changes to the historical present tense. His grammatical shift emphasizes that like <u>Beowulf</u>, <u>Hamlet</u>, or <u>Moby Dick</u>, tombs such as Maeshowe transcend time and are open to new readings, whether by trained archaeologists, pilgrims, casual visitors, or writers.</p>
+ <p>[5] Robert Crawford draws more explicit parallels between Maeshowe itself and literature in his essay, &ldquo;Maes Howe Sappho.&rdquo; Noting the continuing appeal of the tomb, how today &ldquo;people still treasure&rdquo; the moment that the sun lines up with the passage, he compares the ancient monument to poetry:</p><blockquote>However different we and our family groups, our tribes, have become, we can and do still savor that sense of alignment and attunement and have our own ways of articulating some sort of consonance between ourselves, our intimate groupings, and the universe that surrounds us. Though such patternings may be deconstructed, they seem to emerge from a deep need that recurs across generations, like a persistent internal rhyme, and poetry, this most nuanced way of making with words, is a way in which that need for attunement is repeatedly articulated through language. If prehistoric sites often appear to relate people to the stars and planets, then poems continue that impulse. (61)
+ </blockquote>
+ <p>Ancient tombs, then, prompt us to ponder our place in the universe, our identity as humans, and in that also they resemble literature. According to Kenneth Brophy, Neolithic monuments &ldquo;were and are locations that embodied the biography of the builders, users, spectators, and excavators&rdquo; (10). It follows that if we think of Maeshowe as a text, Brophy&rsquo;s assertion that the monument absorbs the &ldquo;biography&rdquo; of all who have used it or visited it, positions it as an example of intertextuality. Maeshowe has many constantly changing stories to tell to its different readers, and readers will respond differently to its figurative meanings.</p>
+ <p>[6] In a 1977 column for <u>The Orcadian</u> newspaper, George Mackay Brown describes how witnessing the midwinter solstice at Maeshowe affects him: &ldquo;Winter after winter I never cease to wonder at the way primitive man arranged, in hewn stone, such powerful symbolism&rdquo; (&ldquo;Maeshowe at Midwinter&rdquo; 88). Like Richards, Brown is emphasizing the figurative qualities of the structure, which he has further explored in poetry. However, the first of his 1999 &ldquo;Two Maeshowe Poems&rdquo; (often printed as a stand-alone) opens not at the tomb, but with an image of the neighboring stone circle, Brodgar. Perhaps surprising to most readers, this would resonate with archaeologists since current scholarship emphasizes that the sites comprising The Heart of Neolithic Orkney are not self-contained but exist and function in relation to one another and to the surrounding landscape (See &ldquo;Heart of Neolithic Orkney WHS: Setting Project&rdquo; 5). As such, they should not be interpreted as discrete entities. It is fitting, then, that Brown&rsquo;s poem moves seamlessly through a series of images that integrate Brodgar&rsquo;s &ldquo;light and darkness&rdquo; with Maeshowe&rsquo;s &ldquo;flowers [and] stone&rdquo; (a reference to the runic graffiti carved by Vikings inside the tomb) and &ldquo;skulls&rdquo; (Lines 1, 9, 11). The first word of the poem, &ldquo;Circle,&rdquo; is semantically echoed in the initial word of each ensuing stanza, &ldquo;Ring,&rdquo; &ldquo;Wheel,&rdquo; and &ldquo;Round,&rdquo; subtly shifting from the geometrically circular Brodgar to the tumescent mound of Maeshowe and emphasizing the cycle of &ldquo;life and death&rdquo; (7). For this is a poem about regeneration, how &ldquo;Out of those skulls / Breaks the first green shoot, the full ear, then the bread&rdquo; (11-12). Throughout, juxtaposed images look for the positive to outweigh the negative: &ldquo;We move in shadows,&rdquo; but &ldquo;Brodgar has burned on the moor a dance of sun&rdquo;; &ldquo;Ring of quern and plough&rdquo; (a quern is a stone for grinding grain) are charged to &ldquo;contain / Our tumults of blood&rdquo;; &ldquo;The stars&rsquo; chaos is caught in a strict rein&rdquo;; the word &ldquo;stone&rdquo; is enveloped by &ldquo;flowers,&rdquo; and &ldquo;beauty and love&rdquo;; similarly, &ldquo;snow&rdquo; is flanked by &ldquo;sun&rdquo; and &ldquo;seed.&rdquo; So darkness becomes light, destructive violence is subservient to the raising and grinding of grain for bread, order makes sense of the universe, the beautiful and the warm temper the hard and the cold, and new life will follow death.</p>
+ <p>[7] Brown&rsquo;s interpretation of these monuments, his use of the architectural circularity and roundness of the Ring of Brodgar and Maeshowe as metaphors for the lifecycle and the possibility of renewal, is shared by archaeologists, who despite its being a burial site, have also associated Maeshowe and its rituals with the agricultural year. Neolithic people were not nomadic but had gradually become settled farmers, living by the routines and rhythms of the seasons, which, according to Richards, constituted &ldquo;an analogy with the human life cycle and past generations&rdquo; (&ldquo;Doorways&rdquo; 65). Time&rsquo;s passage was the organizational framework for survival as well as mortality, and the tombs, he writes, were &ldquo;a metaphorical extension of daily life&rdquo; (&ldquo;Doorways&rdquo; 76). Trevor Garnham, an architect, develops that idea further: &ldquo;Burying bones in the earth was perhaps to seek some metaphoric relationship with the planting of seeds. In its maturity and death, the seed containing the essence of its own renewal served as the inspiration for the hope of life&rsquo;s rebirth in some other form&rdquo; (87). In pairing skeletal remains with seeds as an expression of hope for the future, Garnham&rsquo;s analogy is comparable to the positive final image of Brown&rsquo;s poem, the &ldquo;skulls&rdquo; engendering the &ldquo;green shoots&rdquo; and the &ldquo;bread&rdquo; of life.</p>
+ <p>[8] Brown had written earlier of Maeshowe in his 1996 poem, &ldquo;Maeshowe: Midwinter,&rdquo; choosing then to focus on the solstice. However, the imagery here is not rooted in the agricultural cycle, the earthly world of querns, ploughs, and bread; instead, he connects the pre-Christian tomb to the Christian calendar. The opening phrase, &ldquo;Equinox to Hallowmass,&rdquo; immediately integrates the astronomical with the sacred, giving the season of &ldquo;darkness&rdquo; both physical and spiritual dimensions (1). The religious imagery continues in the second stanza as it evokes &ldquo;St Lucy,&rdquo; whose feast day falls on the shortest day of the year (6). She is portrayed as a weaver whose &ldquo;shuttle&rdquo; creates &ldquo;a dark web&rdquo; that &ldquo;fills the loom&rdquo; (7-9), placing at the centre of the poem a world in which light is completely absent: &ldquo;The blackness is solid as a / stone that locks a tomb. / No star shines there&rdquo; (10-12). To be in such a void, with no guiding star, would seem like a moment of psychological despair, yet just as the days begin to lengthen immediately after the solstice, the poem also brightens. The moment when the sun enters the passage is the &ldquo;true ceremony,&rdquo; suggesting that perhaps the pagan reverence for nature carries particular authenticity. Then &ldquo;the last fleeting solstice flame&rdquo; is &ldquo;caught up,&rdquo; leading to an optimistic note as the children&mdash;the future&mdash;sing with &ldquo;voices like leaves of light&rdquo; (19). Again, the poem ends with an image of rebirth, but its tone is less biological and more cosmological.</p>
+ <p>[9] While Brown&rsquo;s poems use these dual frames of reference in order to explore the themes of regeneration that Maeshowe expresses, the biological and cosmological are not at odds. Garnham defines the cosmos as &ldquo;an all-encompassing world of things and phenomena [. . . .] The essential character of this early form of cosmos bound every aspect of a people&rsquo;s life into reciprocal relationships with the forces that give shape to their world&rdquo; (9). The central argument of his book places Neolithic Orkney in this context. Similarly, reading Brown&rsquo;s two Maeshowe poems together reveals that the &ldquo;green shoot&rdquo; which produces the &ldquo;bread&rdquo; corresponds to the youthful &ldquo;voices like leaves of light.&rdquo; In fact, his insertion of &ldquo;leaves,&rdquo; with its agrarian connotations, into that final line establishes the connection, recognizes that the complex architectural system of domestic houses, burial chambers, and stone circles symbolizes the idea that the activities for which they were designed&mdash;working, eating, loving, sleeping, worshipping, dying, and the possibility of rebirth&mdash;are the web of human existence. The physical bread and the metaphysical song are one.</p>
+ <p>[10] In their respective responses to Maeshowe, Kathleen Jamie, Myra Schneider, and Dilys Rose also address the theme of the cycle of life and death. Jamie&rsquo;s essay, &ldquo;Darkness and Light,&rdquo; describes a quest: she seeks a good, positive darkness because, in the 21st century, it has become impossible &ldquo;to see the real dark for the metaphorical dark . . .the death-dark.&rdquo; Enjoyment of the &ldquo;natural, courteous dark,&rdquo; she has come to believe, has been squeezed out by the Christian belief in a metaphorical darkness that stands for the opposite of salvation (9-10). However, as she is planning this trip, a friend points out that &ldquo;Maes Howe is a metaphor,&rdquo; perhaps exposing a flaw in Jamie&rsquo;s thinking: possibly the natural and metaphorical darknesses are inseparable (10 emphasis added). Although her visit to Maeshowe takes place a couple of days before the solstice, the artificial lights of a surveyor&rsquo;s crew assault her eyes, so she rediscovers no &ldquo;courteous darkness&rdquo; and witnesses &ldquo;no resurrecting beam of sunlight&rdquo; (19). Nevertheless, through Maeshowe, she becomes reconciled to the conventional negative concept of darkness. In terms of &ldquo;wonder&rdquo; similar to Brown&rsquo;s in <u>The Orcadian</u>, she asks, &ldquo;Were they the first people . . . to articulate this metaphor of light and dark, of life and death?&rdquo; and reflects upon its significance:</p><blockquote>For five thousand years we have used darkness as the metaphor of our mortality. We were at the mercy of merciless death, which is darkness. When we died, they sent a beam of midwinter light in among our bones. What a tender, potent gesture. In the Christian era, we were laid in our graves to face the rising sun. We&rsquo;re still mortal, still don&rsquo;t want to die, don&rsquo;t want our loved ones to die. (19-20)
+ </blockquote>
+ <p>Her rejection of a metaphor that she has considered &ldquo;[worn] out&rdquo; and &ldquo;redundant&rdquo; (4, 9) turns out to have been less literary and more personally psychological, for Jamie&rsquo;s visit to the tomb leads to her acceptance of mortality. Whereas previously she has blamed Christianity, she now appreciates that the Christian concept of darkness is part of a continuum of dread traceable back to Neolithic times and forward to our own. The &ldquo;tender, potent gesture&rdquo; of the light penetrating the dark of the tomb, therefore, offers consolation, ameliorating our most profound fears (20).</p>
+ <p>[11] In her poem, &ldquo;Maeshowe,&rdquo; Myra Schneider also describes a guided tour of the cairn, during which the speaker uses the second person singular to address a hypothetical visitor, initially giving the sense that to enter the burial place feels like death as the &ldquo;chill seeps into your body&rdquo; (14). However, this ominous impression is immediately dismissed because &ldquo;a stillness that&rsquo;s other than death inhabits / this place where the undead gather to greet the dead&rdquo; (15-17). The journey through the passage will take &ldquo;you&rdquo; to a place that is not oblivion but, instead, is where the living may consort with their ancestors. Again, the boundary between life and death, which can seem so irrevocable, becomes less absolute and, therefore, less threatening. After the visit is over, its impact will remain, and the speaker imagines her visitor&rsquo;s memories:</p><blockquote>In midwinter you&rsquo;ll visualize the sun piercing the dark that swaddles seeds, see it falling on the aligned entrance, its white shine splitting to burnish the passage wall, flood the ground with gold. (22-26)
+ </blockquote>
+ <p>These images recall Garnham&rsquo;s theory: that the burial of bones is connected metaphorically to the planting of seeds. In the speaker&rsquo;s memory, the dark cradles seeds, the germ of life, rather than bones. Once sunlight enters the tomb, a radiant moment occurs in which the &ldquo;ground&rdquo; will turn &ldquo;gold,&rdquo; like a field of ripe grain. Schneider&rsquo;s poem, like Brown&rsquo;s, affirms the archaeological reading of Maeshowe as a place of renewal, but in this case that renewal goes beyond the promise of the agricultural cycle. An individual will be able to experience, perhaps during times of psychological or spiritual gloom, the moment of glory when the sun is &ldquo;piercing / the dark.&rdquo; There is a Romantic quality to these lines: Maeshowe will stay with Schneider&rsquo;s speaker as those daffodils stay with Wordsworth, &ldquo;to flash upon the inward eye / That is the bliss of solitude,&rdquo; to stimulate the imagination (24). Having herself benefited from the tomb&rsquo;s restorative qualities, the speaker is inspired to spread the word, to share her revelation with &ldquo;you,&rdquo; the reader.</p>
+ <p>[12] Besides the drama of the solstice, another inspirational feature of Maeshowe is the Viking runes carved on the interior walls. Referring to these inscriptions as &ldquo;The first island poems,&rdquo; Brown quotes them emphatically in the second of the paired poems: &ldquo;INGIBIORG IS THE LOVELIEST GIRL / HERMUND WITH A HARD AXE CARVED RUNES&rdquo; (&ldquo;Two&rdquo; 13, 18-19). Many have been struck by the simple humanity of these statements, as well as the paradox inherent in this lusty youthful scrawling being hidden in a tomb. Dilys Rose, in &ldquo;Maeshowe Nipple,&rdquo; for instance, lists the prosaic concerns of the Vikings, portraying them as &ldquo;intrepid&rdquo; but also homesick, missing &ldquo;sweethearts and family&rdquo; (4, 9). At the ends of their respective poems, both Brown and Rose emphasize that Maeshowe was merely a temporary shelter for the Vikings: the &ldquo;young seamen climbed out of Maeshowe, / Their nostrils wide to the salt wind&rdquo;; &ldquo;the dragon boats moved on&rdquo; (Brown &ldquo;Two&rdquo; 23-24; Rose 11). Crawling out of the subterranean tomb and heading for further maritime adventures, the men re-enter the world, extending the overall theme of regeneration. Brown, as we have seen, has already linked the tomb with the life-giving promise of &ldquo;the first green shoot, the full ear, then the bread&rdquo; in the first of these paired poems. Rose, in similar terms, also connects the Viking runes with the reassuring knowledge that there will be a crop next year: over the centuries, &ldquo;their tongue / took root and sprouted from invaded soil / green words for <u>Father</u>, <u>Daughter</u>, <u>Bread</u>&rdquo; (11-13). Here, in the final lines, the Viking vocabulary is fresh and verdant, a harbinger of new human life and the grain that nourishes it. Since runic characters are &ldquo;straight-branched&rdquo; (Rose 4), they resemble rows of rudimentary skeletal stick figures which have been buried in the tomb. The bony runes, therefore, have become metaphorical seeds, and Rose&rsquo;s speaker, like Garnham, sees hope in the bone/seed analogy.</p>
+ <p>[13] It is clear, to summarize briefly, that these four creative writers read Maeshowe much as archaeologists and historians of architecture have done, as an expression of hope for the future, particularly in relation to the coming of spring, but also at a more personal level. The texts suggest that to visit these tombs is, as Richards also emphasizes, transformative. Like their ancestors, contemporary visitors are changed, in some manner revitalized, especially if they witness the sun&rsquo;s midwinter alignment, which Brown describes as a &ldquo;pledge of renewal, a cry of resurrection&rdquo; (&ldquo;Maeshowe in Midwinter&rdquo; 88). However, in the work of Jamie, Schneider, and Rose, a further, more political restoration is at work, for all three use images equating Maeshowe with the female body.</p>
+ <p>[14] Kathleen Jamie states early in her essay, &ldquo;We are conceived and carried in the darkness,&rdquo; emphasizing the positive, life-giving qualities of the dark, and inviting the reader to see Maeshowe as a uterus (4). The womb/tomb imagery is developed further when she eroticizes the winter solstice as &ldquo;a complicit kiss,&rdquo; during which &ldquo;the beam of the setting sun shines along the passage, and onto the tomb&rsquo;s back wall&rdquo; (12). When she goes inside the tomb, she expects &ldquo;not utter darkness, but perhaps a wombish red&rdquo;; however, this is denied her because of the lights of the surveyors, one of whom is &ldquo;folded, foetus-like, into the little cell in the back wall&rdquo;: a foetus implanted in the very place where the sunbeam strikes (12,13). When Jamie leaves, she describes taking &ldquo;the smallest and most challenging of journeys, squeezing down a passageway and out into the world of sound and moving air&rdquo; (17). The tunnel that admits the beam has become a birth canal, so Jamie&rsquo;s transformation is not only her intellectual reassessment of the metaphorical value of darkness; she visualizes her own rebirth in more literal terms too, with Maeshowe cast as the mother.</p>
+ <p>[15] Myra Schneider&rsquo;s &ldquo;Maeshowe&rdquo; also hints that to visit the tomb is to return to the womb when the speaker remarks that although &ldquo;you&rdquo; are part of a tour group, you will realize that you are &ldquo;alone&rdquo; and have &ldquo;never travelled so far back / so far in&rdquo; (8-10). This analogy is made more explicit later in the poem when the sun enters the passage: &ldquo;In that deep chamber / you&rsquo;ll be bathed in red, not the red spilt in hatred&mdash;/the red that&rsquo;s birth, the heart looming with the blood&rdquo; (24-28). In the vision that the speaker evokes for the visitor&rsquo;s memory, therefore, the &ldquo;dark that swaddles seeds&rdquo; not only nurtures and protects the grain that will ripen into crops, but also the fertilized ovum (23). With no dazzling and intrusive surveyors&rsquo; lights, Schneider suggests that it is possible for us to experience the &ldquo;wombish red&rdquo; that was denied Jamie, blood that is the force of life rather than the mark of violence.</p>
+ <p>[16] Dilys Rose&rsquo;s poem, &ldquo;Maeshowe Nipple,&rdquo; on the other hand, in addressing the Viking use of the tomb, acknowledges that violence has taken place. The title, of course, immediately signals that Maeshowe is female, and the opening lines graphically describe the tomb&rsquo;s external anatomy: a &ldquo;breast,&rdquo; with an &ldquo;aureola / sandy-rimmed, the nipple leaking a pale trail / to hidden chambers&rdquo; (1-3). Within, Maeshowe&rsquo;s chambers have been &ldquo;invaded&rdquo; by men who &ldquo;inscribed their conquests&rdquo; and &ldquo;totted up the loot&rdquo; (12, 4, 6). Even though the poem has initially compared the cairn to a breast rather than a womb, this seems like a rape or an assault by men exercising their power and keeping track of their plunder. As human and homesick as the poem presents the young men, it does not forget that their presence in Maeshowe is as uninvited intruders who leave their runic seeds carved into the chamber walls.</p>
+ <p>[17] To make sense of this pattern of imagery, it is helpful to turn to an earlier female author, similarly inspired by her visit to a Neolithic site. Naomi Mitchison wrote <u>Early in Orcadia</u> after a friend took her to another of Orkney&rsquo;s chambered tombs, Isbister, which has no passage entry, because &ldquo;she knew it would waken something in me&rdquo; (8). Set in Neolithic times, the novel follows a family and its descendants as they settle on Orkney, establish homes and villages, and erect the monuments in which they practice their religious rituals. Mitchison depicts the cairns predating the stone circles (both Isbister and Maeshowe are, in fact, thought to have been built before Brodgar) and imaginatively describes the changing beliefs prompting these architectural developments. Tradition holds that pregnant women must visit the tomb in order that the ancestral spirit will be passed to their children (132). One woman, Ba, making this journey, reflects that a &ldquo;few moons&rdquo; have passed since she became pregnant and stopped menstruating. She also knows that a powerful goddess, &ldquo;the big bad Moon Woman had once had an honouring place,&rdquo; had watched over the dead (119). However, the Moon Woman has been supplanted by the sun. The burial place was &ldquo;pulled apart and scattered by the Sun Man and the bulls. After that came the beginning of their own honouring place where the bones lay and where you must go down on your knees before you could get in&rdquo; (119). The later passage cairn, then, is a creation of the masculine sun, the same sun that shines down the passageway at midwinter. Accompanied by bulls, also male, the Sun Man has ravaged the Moon Woman&rsquo;s tomb and designed a new one to suit his own needs. Even so, the burial place is still associated with female fertility. Nervously, Ba enters &ldquo;on her hands and knees . . . under and between great stones.&rdquo; Once inside, though, she thinks of the moments before she conceived her child: &ldquo;She was waiting, almost as she had waited in the soft sand behind that rock in the sun-warmed geo a few moons back&rdquo; (130). For Ba, the tomb is not frightening. She recalls not a violent rape, but a loving encounter, and the darkness feels as warm as the &ldquo;geo&rdquo; (an Orcadian word referring to a deep, narrow fissure in a cliff) where she met her lover. Following her memory of the moment of conception, she is &ldquo;push[ed] . . . back, back to the way out, back to the square of light, to the way out into the real world on hands and knees as one must&rdquo; (130). Like Jamie, Ba is compelled to crawl, to battle her way through the passage to be reborn.</p>
+ <p>[18] By the end of <u>Early in Orcadia</u>, the stone circle, with its emphasis on light rather than dark, is becoming the ultimate manifestation of the transfer of power from the Moon Woman to the Sun Man. Its significance is explained by the &ldquo;Great Man,&rdquo; who is &ldquo;painted with sun circles,&rdquo; to Moon Woman after he has summoned her to his presence: &ldquo;The great tall stones . . . were so raised to show the way of the sun, who is our master and our maker&rdquo; (169). Moon Woman, however, is aware of the injustice of this arrangement: &ldquo;They said that the moon was the servant of the sun, to do what he wanted, but that, Moon Woman knew, was not right. In her own mind she unsaid it&rdquo; (170). At first she is jealous and afraid, but the final vision of the novel is hers, and it is, to an extent, a reconciliation of powers:</p><blockquote>If I were to say a few small and easy words to the Great Man, if I were to move myself in a certain way, then we would be sun and moon. Then I would put my fingers onto the colour, onto that knife, onto his eyes, . . . eyes, onto that round, shining sun that hangs over his heart, fingering it so that my fingers would meet his, me going . . . onto all parts of him. He would be mine as the sun is the moon&rsquo;s. (176)
+ </blockquote>
+ <p>She is picturing an intertwining of sun and moon, of masculine and feminine&mdash;a consummation. The partnership is not one of complete equality, though, for she also envisions not that the sun will be the master and the moon the servant, but that he will be hers, that the moon will possess the sun, that her status will be restored.</p>
+ <p>[19] Mitchison&rsquo;s fictional representation of light/sun/man emerging as the object of worship and awe, assuming the rank previously held by dark/moon/woman, is an idea rooted across cultures: &ldquo;A fundamental polarity in many creation myths,&rdquo; according to Trevor Garnham, &ldquo;contrasts the dark, fecund, harbouring earth with the up-drawing sun&rdquo; (145). He points out, for example, that &ldquo;by the time of the Celtic occupation of Britain, there were well-established beliefs and practices focused on the sun&rdquo; and that in Norse mythology, &ldquo;a male hierarchy supplanted older, matriarchal law&rdquo; (161, 109). Analyzing the archaeological sites within this paradigm, Garnham argues, supports the theory that religious practice fundamentally changed along with the architecture, that &ldquo;ritual activity associated with burial cairns became transferred to stone circles&rdquo; (152).</p>
+ <p>[20] Maeshowe, however, suggests a mid-point in this ritualistic shift because although, like earlier stalled cairns, it is dark and womb-like, its annual climactic moment is when the sun lights up the passage. Garnham sees the Neolithic architecture of Orkney as a progression. The first structures, the houses, were purely domestic; they had a &ldquo;nurturing role&rdquo; (66). The houses at the coastal village site, Skara Brae, therefore, &ldquo;seem to be fundamentally powerful symbols of protection and gathering, echoing that of the pot and the basket&rdquo; (70). Since the manufacture of both pots and baskets was the work of women, Garnham is reading the houses as essentially feminine. They were vessels, their stone walls embanked by earth. Both Garnham and Richards point out that the houses were models for the tombs: the passage graves are structurally similar to the houses at Skara Brae, and both were covered with turf (Garnham 48; Challands, Muir &amp; Richards 242, 245). Cairns of the Maeshowe type, with passage entries, however, were the later forms. The earlier stalled structures, such as Midhowe, on the island of Rousay, did not feature the tunnel entrance.</p>
+ <p>[21] Archaeologists do not agree on the social significance of passage cairns and sun circles, the extent to which their development reveals a move to a more hierarchical society. Challands, Muir, and Richards state, &ldquo;In many ways, everything about the architecture of Maeshowe enforces a notion of separation, division, and restriction&rdquo; (247). Elsewhere, Richards and another co-writer are more guarded. They point out that the tomb resembles House 2 at the nearby Barnhouse settlement, a larger house than any at Skara Brae that was probably &ldquo;highly restricted on the basis of an individual&rsquo;s status, probably additionally defined in terms of age and gender.&rdquo; However, they also warn that there is insufficient archaeological evidence to &ldquo;leap to conclusions about a patriarchal group of &lsquo;elders&rsquo; who used knowledge as a commodity to maintain their power over women and younger men&rdquo; (Muir &amp; Richards 204). Although cautious, they do acknowledge that &ldquo;power and authority,&rdquo; probably based on &ldquo;cosmological beliefs,&rdquo; would have been necessary to build the monuments (199). Garnham, on the other hand, leaning not only on physical but also on anthropological evidence, argues that the more formal structure <u>does</u> support the idea of hierarchy and that the estimated 100,000 man-hours that would have been necessary to build it point to a more complex social structure that had to extend beyond the local community (128). Furthermore, he writes, the layout of individual chambers &ldquo;can be read as a metaphor of primogeniture&rdquo; (74). Like Richards, Garnham interprets the passage as a symbol of privilege because it was hard to get inside. However, citing Eliade&rsquo;s <u>Patterns in Comparative Religion</u>, he also emphasizes that there is &ldquo;a close connection between solar theology and the elite&rdquo; (163). In this context it seems that &ldquo;allowing access to the sun . . . was more important that [sic] allowing access to members of the tribe&rdquo; (131-132).</p>
+ <p>[22] Maeshowe can be seen, then, as expressing a point of tension between earth and sun in which the dark tomb is literally infiltrated by solar rays on one day only. The subsequent building of the Circle of Brodgar elevates the stature of the sun. Fully above ground, the center of its astronomical and religious year occurs not in December, but in June, at the midsummer solstice. Garnham points out that while a smaller circle, the Stones of Stenness, is open to the sun at its &ldquo;point of maximum power,&rdquo; Maeshowe allows the sun inside only when it is &ldquo;at its lowest ebb.&rdquo; Except at midwinter, &ldquo;the tomb is dark, cold, and filled with white bones, echoing the whiteness of the moon&rdquo; (207). Although Stenness actually predates Maeshowe by perhaps 400 years, throwing off the neat chronology of <u>Early in Orcadia</u>, Garnham&rsquo;s interpretation of Maeshowe and the stone circles parallels Mitchison&rsquo;s literary response to the Isbister tomb: compared to earlier cairns, Maeshowe is a more patriarchal development, the passageway allowing the masculine sun to displace the feminine &ldquo;whiteness of the moon,&rdquo; and yet the bones, the metaphorical seeds, still lie dormant; the presence of Moon Woman endures.</p>
+ <p>[23] Although <u>Early in Orcadia</u> ends with Moon Woman&rsquo;s vision of a mingling of sun and moon, of masculine and feminine, there is a note of uncertainty as she asks herself, &ldquo;Should I, then?&rdquo; (176). She does not ask &ldquo;Can I?&rdquo; but &ldquo;Should I?&rdquo; Her question is not whether she is personally capable, but whether it would be wise to challenge the elite power structure in the name of justice. Readers are left without an answer, but since women are still fighting for equality in the institutions of politics and religion, it is reasonable to assume that if Moon Woman did attempt it, she met with a great deal of resistance. It is with this in mind, then, that we can return to the Maeshowe experiences of Jamie, Schneider, and Rose. Their visits to the cairn suggest that to see it merely as a symbol of agricultural regeneration, or even more broadly of hope, is incomplete. Something more needs to be resurrected, and their use of the female imagery effectively acknowledges and reclaims a feminine narrative for Maeshowe. In Rose&rsquo;s poem, twelfth-century Vikings may take up residence inside, but 900 years later, the reader is instructed to &ldquo;See,&rdquo; to bear witness to &ldquo;a green breast in a green field,&rdquo; the most nurturing part of a woman&rsquo;s body surrounded by the new growth of spring (1). When Schneider refers to the &ldquo;red that&rsquo;s birth&rdquo; rather than the &ldquo;red spilt in hatred,&rdquo; and describes how the sun will &ldquo;burnish the passage wall, / flood the ground with gold&rdquo; and, similarly, when Jamie refers to the &ldquo;complicit kiss,&rdquo; it is as if Moon Woman&rsquo;s consummation has finally taken place and justice has been restored.</p>
+ <p>[24] Richards asks where the doors of tombs lead, to what &ldquo;revelation.&rdquo; Indeed, the creative writing of Jamie, Schneider, and Rose transports readers through Maeshowe&rsquo;s entryway towards &ldquo;revelation.&rdquo; Their collective responses help us to recognize the humanity of Neolithic peoples, to appreciate how common experiences connect us to the past. They ask us to consider the roots of sexual discrimination, the possible marginalization of women 5000 years ago. More universally, they honor the memory of displaced matriarchal societies and, thus, prompt us to reflect on the status of women today. While, as Hall points out, male authors of the mid-twentieth-century Scottish Literary Renaissance had a nationalist political agenda, &ldquo;looking for Scotland in Scotland&rsquo;s prehistory&rdquo; (160), these female writers look to the past for a feminist renewal, both personal and political. As such, their interpretations complement and illuminate those of archaeologists. Naomi Mitchison, acknowledging that she may be &ldquo;treading on the toes of archaeologists,&rdquo; points out that their physical &ldquo;evidence may not always offer a clear interpretation, in fact it very seldom does&rdquo; (113). For despite their painstaking sifting (both literal and figurative) of physical evidence, archaeologists must, finally, apply their own imaginations.</p>
+ <p>[25] Archaeologists themselves recognize the uncertainty inherent in drawing conclusions about ancient societies from the surviving fragments of their lives. In reference to the recent discovery of a complex of temples at the Ness of Brodgar, Richards has said, &ldquo;This was a ceremonial centre, and a vast one at that. But the religious beliefs of its builders remain a mystery&rdquo; (qtd. in McKie). In fact, the excavation of this temple complex is prompting a reassessment of the entire Heart of Neolithic Orkney. Tom Muir, of the Orkney Museum, goes so far as to assert that &ldquo;the whole text book of British archaeology for this period will have to be torn up and rewritten from scratch thanks to this place&rdquo; (qtd. in McKie). Even as archaeologists, using sophisticated technology, scrape away the dust of time from this long-buried site, it remains true that &ldquo;Insights can only come from interpretation&rdquo; (Jones and Richards 195). It is in this interpretative arena that science must join forces with the arts and humanities in the search for knowledge, for a fuller understanding.</p>
+ <p>[26] George Mackay Brown has written, &ldquo;People in 2000 AD are essentially the same as the stone-breakers [. . .] of 3000 BC&rdquo; (&ldquo;Brodgar Poems&rdquo; lines 10-12). Knowing where we have come from, fleshing out our understanding of the prehistoric world and, therefore, ourselves, takes the skills and multiple perspectives not only of scientists, archaeologists, architects, and anthropologists, but also essayists, poets, and more. The interdisciplinary synergy involved in comparing archaeological, anthropological, and literary interpretations of Maeshowe sheds light on the shadows of the past, raises questions about the more elusive shadows of Neolithic women, and provides historical context for our understanding of gender relations across time. Like crawling through the passage into the dark and out to the light, the empirical and literary journeys into the mysteries of Maeshowe are indeed transformative, exhuming the bones of the past that we may better nurture the seeds of the future.</p>
+ <p>ACKNOWLEDGEMENTS. Thanks are due to Edward Gale Agran, Stephen Potthoff, and the anonymous reviewers for their time and valued advice. </p>
+ <p align="center">WORKS CITED</p>
+ <p>Bevan, Archie, and Brian Murray. Eds. <u>The Collected Poems of George Mackay Brown</u>. London: John Murray, 2005. Print.</p>
+ <p>Brown, George Mackay. &ldquo;Brodgar Poems (1992).&rdquo; In Bevan and Murray. 308-312. Print.</p>
+ <p>---. &ldquo;Maeshowe: Midwinter.&rdquo; 1996. In Bevan and Murray. 320. Print.</p>
+ <p>---. &ldquo;Maeshowe at Midwinter.&rdquo; 1977. <u>Under Binkie&rsquo;s Brae</u>. Edinburgh: Gordon Wright Publishing, 1979. 87-88. Print.</p>
+ <p>---. &ldquo;Two Maeshowe Poems.&rdquo; 1999. In Bevan and Murray. 420-421. Print.</p>
+ <p>Card, Nick, et al. &ldquo;Bringing a Landscape to Life? Researching and Managing &lsquo;The Heart of Neolithic Orkney&rsquo; World Heritage Site.&rdquo; <u>World Archaeology</u> 39.3 (2007): 417-435. EBSCO <u>Academic Search Complete</u>. Web. 29 Jun. 2011.</p>
+ <p>Challands, Adrian, Tom Muir, and Colin Richards. &ldquo;The Great Passage Grave of Maeshowe.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 229-248. Print.</p>
+ <p>Crawford, Robert. &ldquo;Maes Howe Sappho.&rdquo; <u>Yale Review</u> 95.1 (2007): 60-65. OhioLINK Electronic Journal Center. Web. 29 Jun. 2011.</p>
+ <p>Garnham, Trevor. <u>Lines on the Landscape, Circles from the Sky: Monuments of Neolithic Orkney</u>. Stroud, Gloucestershire: Tempus, 2004. Print.</p>
+ <p>Hall, Simon W. <u>The History of Orkney Literature</u>. Edinburgh: John Donald/Birlinn Ltd., 2010. Print.</p>
+ <p>&ldquo;Heart of Neolithic Orkney WHS: Setting Project.&rdquo; Historic Scotland, 2008. EBSCO <u>Academic Search Complete</u>. Web. 30 Jun. 2011.</p>
+ <p>Jamie, Kathleen. &ldquo;Darkness and Light.&rdquo; <u>Findings: Essays on the Natural and Unnatural World</u>. Ed. Jamie. St. Paul, MN: Graywolf, 2005. 3-22. Print.</p>
+ <p>Jones, Si&acirc;n, and Colin Richards. &ldquo;The Villagers of Barnhouse.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 195-204. Print.</p>
+ <p>McKie, Robin. &ldquo;Neolithic Discovery: Why Orkney is the Centre of Ancient Britain.&rdquo; <u>The Guardian / The Observer</u>. 6 Oct. 2012. Web. 16 Mar. 2013.</p>
+ <p>Mitchison, Naomi. <u>Early in Orcadia</u>. Glasgow: Richard Drew, 1987. Print.</p>
+ <p>Richards, Colin. &ldquo;Doorways into Another World: The Orkney-Cromarty Chambered Tombs.&rdquo; <u>Vessels for Ancestors: Essays on the Neolithic of Britain and Ireland in Honour of Audrey Henshall</u>. Ed. Niall Sharples and Alison Sheridan. Edinburgh: Edinburgh UP, 1992. 62-76. Print.</p>
+ <p>Riddoch, Lesley. &ldquo;Stone Age Marvels Which Inspire and Astonish: Wonders of Scotland.&rdquo; <u>The Scotsman</u>. 13 Feb. 2006. Web. 30 Jun. 2011.</p>
+ <p>Rose, Dilys. &ldquo;Maes Howe Nipple.&rdquo; <u>Bodywork</u>. Edinburgh: Luath Press, 2007. Print.</p>
+ <p>Schneider, Myra. &ldquo;Maeshowe.&rdquo; <u>Circling the Core</u>. London: Enitharmon Press, 2008. 23-24. Print.</p>
+ <p>Wordsworth, William. &ldquo;I wandered lonely as a cloud.&rdquo; <u>The Norton Anthology of English Literature</u>. Eighth Ed. Ed. Stephen Greenblatt and M.H. Abrams. New York: Norton, 2006. 305-306. Print.</p>
+<p><strong>Contributor's Note</strong></p>
+ <p><strong>CHARLOTTE FAIRLIE</strong> teaches English at Wilmington College, in Wilmington, Ohio. Her published work focuses on Scottish literature and rural life in literature. She is currently co-editing an anthology of poetry relating to scythes and mowing.</p></td>
+ <td valign="top"><center>
+ <a href="../index.html"> <img src="../image/btncu.gif" alt="Current Issue" border="0" height="42" width="79"></a><br>
+ <a href="../download.html" tppabs="http://www.genders.org/download.html"> <img src="../image/btndo.gif" alt="Download" tppabs="http://www.genders.org/image/btndo.gif" align="bottom" border="0" height="42" width="115"></a><br>
+ <a href="../edit.html" tppabs="http://www.genders.org/edit.html"> <img src="../image/btned.gif" alt="Editorial Board" tppabs="http://www.genders.org/image/btned.gif" align="bottom" border="0" height="50" width="80"></a><br>
+ <a href="../guide.html" tppabs="http://www.genders.org/guide.html"> <img src="../image/btngu.gif" alt="Contributor Guidelines" tppabs="http://www.genders.org/image/btngu.gif" align="bottom" border="0" height="42" width="90"></a><br>
+ <a href="../recent.html"> <img src="../image/btnre.gif" alt="Recent Issues" tppabs="http://www.genders.org/image/btnre.gif" align="bottom" border="0" height="41" width="79"></a><br>
+ <a href="../link.html"> <img src="../image/btnli.gif" alt="Links &amp; Books" border="0" height="46" width="97"></a><br>
+ </center></td>
+ </tr>
+ </tbody>
+ </table>
+ <table width="500">
+ <tbody>
+ <tr>
+ <td><p><a href="../download.html">Copyright</a> ©2010 Ann Kibbey.
+
+ All Rights Reserved Worldwide.<br>
+ </p>
+ <p> </p>
+ <center>
+ <a href="../download.html"><font size="1">Download</font></a><font size="1"> || <a href="../edit.html">Editorial Board</a> || <a href="../guide.html">Submission
+
+ Guidelines</a> || <a href="../index.html">Current Issue</a> || <a href="../recent.html">Recent Issues</a> || <a href="../link.html">Links
+
+ &amp; Books</a></font>
+ </center></td>
+ </tr>
+ </tbody>
+ </table>
+ <p></p>
+ <p align="right">
+
+ <table width="550">
+ <tbody>
+ <tr>
+ <td width="361"></td>
+ <td width="72"><p><img src="../image/algosmlr.gif" alt="Genders" align="bottom" border="0" height="72" width="72"> </p></td>
+ <td width="101"><b> <font size="1">Genders Journal</font></b> <font size="1"><br>
+ 226 UCB<br>
+ University of Colorado<br>
+ Boulder, CO 80309<br>
+ http://www.Genders.org</font></td>
+ </tr>
+ </tbody>
+ </table>
+ </p>
+ <p align="right"></p></td>
+ </tr>
+ </tbody>
+</table>
+</p>
+<p></p>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
new file mode 100644
index 0000000..b47f85b
--- /dev/null
+++ b/python/tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml
@@ -0,0 +1,66 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">A world of individuals</title>
+ <author>
+ <persName><forename type="first">N</forename><surname>Goodman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Problems and projects</title>
+ <imprint>
+ <date type="published" when="1972">1972</date>
+ <biblScope unit="page" from="155" to="172" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Implicit definition sustained</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">V O</forename><surname>Quine</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">The ways of paradox and other essays</title>
+ <meeting><address><addrLine>Cambridge, MA</addrLine></address></meeting>
+ <imprint>
+ <publisher>Harvard University Press</publisher>
+ <date type="published" when="1976">1976b</date>
+ <biblScope unit="page" from="133" to="136" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133–136). Cambridge, MA: Harvard University Press.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <monogr>
+ <title level="m" type="main">On some difficulties in the theory of transfinite numbers and order types</title>
+ <author>
+ <persName><forename type="first">B</forename><surname>Russell</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1906">1906</date>
+ <publisher>Proceedings of London Mathematical Society</publisher>
+ <biblScope unit="volume">4</biblScope>
+ <biblScope unit="page" from="29" to="53" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29–53.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/files/grobid_refs_s1047951103000064.tei.xml b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
new file mode 100644
index 0000000..e0eae8a
--- /dev/null
+++ b/python/tests/files/grobid_refs_s1047951103000064.tei.xml
@@ -0,0 +1,499 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">The community control of rheumatic fever and rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">N</forename><surname>Dondong</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Elkholy</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="285" to="294" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note>Report of a WHO international co-operative project</note>
+ <note type="raw_reference">Strasser T , Dondong N , Elkholy A et al. The community control of rheumatic fever and rheumatic heart disease. Report of a WHO international co-operative project. Bull. WHO 1981; 59: 285–294.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Rehman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">22</biblScope>
+ <biblScope unit="page" from="185" to="192" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Rehman H . Acute rheumatic fever in children. JPMA 1972; 22: 185–192.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever in Sudanese children</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Ismail</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>El Amin</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Arab J Med</title>
+ <imprint>
+ <biblScope unit="volume">2</biblScope>
+ <biblScope unit="page" from="21" to="24" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Ismail SA , El Amin A . Rheumatic fever in Sudanese children. Arab J Med 1983; 2: 21–24.</note>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+ <analytic>
+ <title level="a" type="main">Incidence of heart disease in children at NICVD</title>
+ <author>
+ <persName><forename type="first">K</forename><forename type="middle">U</forename><surname>Aziz</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JPMA</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="300" to="305" />
+ <date type="published" when="1984">1984</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Aziz KU . Incidence of heart disease in children at NICVD. JPMA 1984; 34: 300–305.</note>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+ <monogr>
+ <title level="m" type="main">The various manifestations of rheumatic fever as exemplified in childhood and early life</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">B</forename><surname>Cheadle</surname></persName>
+ </author>
+ <imprint>
+ <publisher>Smith and Co</publisher>
+ <biblScope unit="page">1889</biblScope>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Cheadle WB . The various manifestations of rheumatic fever as exemplified in childhood and early life. Smith and Co., London, 1889.</note>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-I. A major public health problem</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="336" to="345" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-I. A major public health problem. WHO Chron 1980; 34: 336–345.</note>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+ <analytic>
+ <title level="a" type="main">Prevalence of heart disease in school children of Islamabad</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">M</forename><surname>Malik</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Jaffrey</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Ahmed</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">Zubeda</forename><surname>Khanum</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pakistan Heart Journal</title>
+ <imprint>
+ <biblScope unit="volume">14</biblScope>
+ <biblScope unit="page" from="2" to="6" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Malik SM , Jaffrey S , Ahmed S , Zubeda Khanum : Prevalence of heart disease in school children of Islamabad. Pakistan Heart Journal 1981; 14: 2–6.</note>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease and overcrowding</title>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Watkins</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">P</forename><surname>Quinn</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Am J Public Health</title>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ <biblScope unit="page" from="1071" to="1081" />
+ <date type="published" when="1948">1948</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Watkins JH , Quinn JP . Rheumatic heart disease and overcrowding. Am J Public Health 1948; 38: 1071–1081.</note>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+ <analytic>
+ <title level="a" type="main">The spectrum and specter of rheumatic fever in 1980&apos;s</title>
+ <author>
+ <persName><forename type="first">W</forename><surname>El-Sadr</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Taranta</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Clinical Immunology Up-Date. Edited by Franklin EC</title>
+ <imprint>
+ <biblScope unit="page" from="183" to="203" />
+ <date type="published" when="1979">1979</date>
+ <publisher>Elsevier</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">El-Sadr W , Taranta A . The spectrum and specter of rheumatic fever in 1980&apos;s. In: Clinical Immunology Up-Date. Edited by Franklin EC . Elsevier, New York, 1979, pp 183–203.</note>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+ <monogr>
+ <title level="m" type="main">Tonsillitis in adolescent, Bailliere Tendoll and Cox</title>
+ <author>
+ <persName><forename type="first">C</forename><surname>Haig-Brown</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1886">1886</date>
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Haig-Brown C . Tonsillitis in adolescent, Bailliere Tendoll and Cox, London 1886.</note>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+ <analytic>
+ <title level="a" type="main">Studies on the transmission within the families of group A hemolytic streptococci</title>
+ <author>
+ <persName><forename type="first">L</forename><forename type="middle">I</forename><surname>Levine</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Chapman</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Guerra</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><surname>Cooper</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Krause</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">J Lab Clin Med</title>
+ <imprint>
+ <biblScope unit="volume">67</biblScope>
+ <biblScope unit="page" from="483" to="494" />
+ <date type="published" when="1966">1966</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Levine LI , Chapman SS , Guerra V , Cooper J , Krause RM . Studies on the transmission within the families of group A hemolytic streptococci. J Lab Clin Med 1966; 67: 483–494.</note>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+ <monogr>
+ <title level="m" type="main">Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron</title>
+ <author>
+ <persName><forename type="first">T</forename><surname>Strasser</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="volume">32</biblScope>
+ <biblScope unit="page" from="18" to="25" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Strasser T . Rheumatic fever and rheumatic heart disease in the 1970&apos;s. WHO Chron. 1978; 32: 18–25.</note>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+ <monogr>
+ <title level="m" type="main">Brittanica: Book of year 1991</title>
+ <imprint>
+ <date type="published" when="1991">1991</date>
+ <publisher>Chicago</publisher>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Brittanica: Book of year 1991. Chicago, 1991.</note>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+ <monogr>
+ <title level="m" type="main">Pockets of rheumatic fever in developed world. XI World Congress of Cardiology</title>
+ <author>
+ <persName><forename type="first">R</forename><surname>Talbot</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1990">1990</date>
+ <pubPlace>Manila</pubPlace>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Talbot R . Pockets of rheumatic fever in developed world. XI World Congress of Cardiology. Manila 1990.</note>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+ <analytic>
+ <title level="a" type="main">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease</title>
+ </analytic>
+ <monogr>
+ <title level="j">Circulation</title>
+ <imprint>
+ <biblScope unit="volume">41</biblScope>
+ <biblScope unit="page" from="A1" to="15" />
+ <date type="published" when="1970">1970</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Intersociety commission for heart disease and resources. Rheumatic fever and rheumatic heart disease study group. Prevention of rheumatic fever and rheumatic heart disease. Circulation 1970; 41: A1–15.</note>
+</biblStruct>
+
+<biblStruct xml:id="b15">
+ <analytic>
+ <title level="a" type="main">Acute rheumatic fever and rheumatic carditis in children</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Rahimtoola</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">H</forename><surname>Shafqat</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Ramzan</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="page" from="2" to="9" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Rahimtoola RJ , Shafqat H , Ramzan A . Acute rheumatic fever and rheumatic carditis in children. Pak Heart J 1980; 3: 2–9.</note>
+</biblStruct>
+
+<biblStruct xml:id="b16">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in developing countries</title>
+ <author>
+ <persName><forename type="first">S</forename><surname>Padmavati</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull. WHO</title>
+ <imprint>
+ <biblScope unit="volume">56</biblScope>
+ <biblScope unit="page" from="543" to="550" />
+ <date type="published" when="1979">1979</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Padmavati S . Rheumatic fever and rheumatic heart disease in developing countries. Bull. WHO 1979; 56: 543–550.</note>
+</biblStruct>
+
+<biblStruct xml:id="b17">
+ <analytic>
+ <title level="a" type="main">Streptococcal infections in families. Factors altering individual susceptibility</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Meyer</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Haggerty</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pediatrics</title>
+ <imprint>
+ <biblScope unit="volume">29</biblScope>
+ <biblScope unit="page" from="539" to="549" />
+ <date type="published" when="1962">1962</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Meyer RJ , Haggerty RJ . Streptococcal infections in families. Factors altering individual susceptibility. Pediatrics 1962; 29: 539–549.</note>
+</biblStruct>
+
+<biblStruct xml:id="b18">
+ <analytic>
+ <title level="a" type="main">Collagen and connective tissue diseases</title>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">A</forename><surname>Shanks</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Textbook of Pediatrics</title>
+ <editor>
+ <persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Forfar</surname></persName>
+ <persName><forename type="first">C</forename><forename type="middle">C</forename><surname>Arneil</surname></persName>
+ </editor>
+ <meeting><address><addrLine>Edinburgh</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1978">1978</date>
+ <biblScope unit="page" from="1501" to="1515" />
+ </imprint>
+ <respStmt>
+ <orgName>Churchill Livingstone</orgName>
+ </respStmt>
+ </monogr>
+ <note type="raw_reference">Shanks RA . Collagen and connective tissue diseases. In: Forfar JA , Arneil CC (eds) Textbook of Pediatrics. Churchill Livingstone, Edinburgh, 1978: 1501–1515.</note>
+</biblStruct>
+
+<biblStruct xml:id="b19">
+ <analytic>
+ <title level="a" type="main">Prophylaxis against recurrence of rheumatic fever</title>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Billoo</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><forename type="middle">S</forename><surname>Abbasi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Sultana</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">L</forename><surname>Desa</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">1</biblScope>
+ <biblScope unit="page" from="8" to="14" />
+ <date type="published" when="1968">1968</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Billoo AG , Abbasi AS , Sultana S , Desa L , Syed SA . Prophylaxis against recurrence of rheumatic fever. Pak Heart J 1968; 1: 8–14.</note>
+</biblStruct>
+
+<biblStruct xml:id="b20">
+ <analytic>
+ <title level="a" type="main">Rheumatic heart disease</title>
+ <author>
+ <persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Syed</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Pak Heart J</title>
+ <imprint>
+ <biblScope unit="volume">5</biblScope>
+ <biblScope unit="page" from="14" to="16" />
+ <date type="published" when="1972">1972</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Syed SA . Rheumatic heart disease. Pak Heart J 1972; 5: 14–16.</note>
+</biblStruct>
+
+<biblStruct xml:id="b21">
+ <analytic>
+ <title level="a" type="main">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control</title>
+ </analytic>
+ <monogr>
+ <title level="j">WHO Chron</title>
+ <imprint>
+ <biblScope unit="volume">34</biblScope>
+ <biblScope unit="page" from="389" to="395" />
+ <date type="published" when="1980">1980</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Community control of rheumatic heart disease in developing countries-II. Strategies for prevention and control. WHO Chron 1980; 34: 389–395.</note>
+</biblStruct>
+
+<biblStruct xml:id="b22">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever: Clinical profile of 339 cases with long term follow-up</title>
+ <author>
+ <persName><forename type="first">M</forename><forename type="middle">K</forename><surname>Joshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">P</forename><forename type="middle">W</forename><surname>Kandoth</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Barve</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Kamat</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Indian pediatr</title>
+ <imprint>
+ <biblScope unit="volume">20</biblScope>
+ <biblScope unit="page" from="849" to="853" />
+ <date type="published" when="1983">1983</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Joshi MK , Kandoth PW , Barve RJ , Kamat JR . Rheumatic fever: Clinical profile of 339 cases with long term follow-up. Indian pediatr 1983; 20: 849–853.</note>
+</biblStruct>
+
+<biblStruct xml:id="b23">
+ <analytic>
+ <title level="a" type="main">Rheumatic fever and rheumatic heart disease in rural south Indian children</title>
+ <author>
+ <persName><forename type="first">G</forename><surname>Koshi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">V</forename><surname>Benjamin</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">G</forename><surname>Cherian</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Bull WHO</title>
+ <imprint>
+ <biblScope unit="volume">59</biblScope>
+ <biblScope unit="page" from="599" to="603" />
+ <date type="published" when="1981">1981</date>
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Koshi G , Benjamin V , Cherian G . Rheumatic fever and rheumatic heart disease in rural south Indian children. Bull WHO 1981; 59: 599–603.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python/tests/files/nature_article.html b/python/tests/files/nature_article.html
new file mode 100644
index 0000000..177da83
--- /dev/null
+++ b/python/tests/files/nature_article.html
@@ -0,0 +1,1379 @@
+
+
+
+
+
+
+
+
+<!DOCTYPE html>
+<html lang="en" class="grade-c">
+<head>
+ <meta charset="utf-8">
+<link rel="dns-prefetch" href="//ajax.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.gstatic.com"/>
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no">
+
+ <title>More than 100 scientific journals have disappeared from the Internet</title>
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+ <meta property="og:type" content="article"/>
+ <meta property="og:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta property="og:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+ <meta name="twitter:card" content="summary_large_image"/>
+ <meta name="twitter:site" content="@nature"/>
+ <meta name="twitter:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta name="twitter:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta name="twitter:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+
+
+ <meta name="journal_id" content="41586"/>
+
+ <meta name="dc.title" content="More than 100 scientific journals have disappeared from the Internet"/>
+
+ <meta name="dc.source" content="Nature 2020"/>
+
+ <meta name="dc.format" content="text/html"/>
+
+ <meta name="dc.publisher" content="Nature Publishing Group"/>
+
+ <meta name="dc.date" content="2020-09-10"/>
+
+ <meta name="dc.type" content="News"/>
+
+ <meta name="dc.language" content="En"/>
+
+ <meta name="dc.copyright" content="2020 Nature"/>
+
+ <meta name="dc.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="dc.description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="prism.publicationName" content="Nature"/>
+
+ <meta name="prism.publicationDate" content="2020-09-10"/>
+
+ <meta name="prism.section" content="News"/>
+
+ <meta name="prism.startingPage" content=""/>
+
+ <meta name="prism.endingPage" content=""/>
+
+ <meta name="prism.copyright" content="2020 Nature"/>
+
+ <meta name="prism.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="prism.url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+
+ <meta name="prism.doi" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="dc.identifier" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="DOI" content="10.1038/d41586-020-02610-z"/>
+
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="dc.creator" content="Diana Kwon"/>
+
+ <meta name="dc.subject" content="Publishing"/>
+
+
+
+<script>(function(e){var t=e.documentElement,n=e.implementation;t.className='js';if(n&&n.hasFeature('http://www.w3.org/TR/SVG11/feature#Image','1.1')){t.className+=' svg'}})(document)</script>
+<link rel="stylesheet" href="/static/css/mosaic-grade-c.26f07b2f11.css">
+
+<link rel="stylesheet" class="js-ctm" href="/static/css/magazine-mosaic-150.7f46c29843.css" media="only screen, print and (-webkit-min-device-pixel-ratio:0) and (min-color-index:0), (-ms-high-contrast: none), only all and (min--moz-device-pixel-ratio:0) and (min-resolution: 3e1dpcm)">
+
+
+ <style>
+ .c-header--brand-border {
+ border-bottom: 5px solid #000;
+ }
+ </style>
+
+<link rel="apple-touch-icon" sizes="180x180" href=/static/images/favicons/nature/apple-touch-icon.f39cb19454.png>
+<link rel="icon" type="image/png" sizes="32x32" href=/static/images/favicons/nature/favicon-32x32.3fe59ece92.png>
+<link rel="icon" type="image/png" sizes="16x16" href=/static/images/favicons/nature/favicon-16x16.951651ab72.png>
+<link rel="manifest" href=/static/manifest.1a481c42b1.json>
+<link rel="mask-icon" href=/static/images/favicons/nature/safari-pinned-tab.69bff48fe6.svg color="#000000">
+<link rel="shortcut icon" href=/static/images/favicons/nature/favicon.62367f778b.ico>
+<meta name="msapplication-TileColor" content="#000000">
+<meta name="msapplication-config" content=/static/browserconfig.e35b3b052c.xml>
+<meta name="theme-color" content="#000000">
+<meta name="application-name" content="Nature">
+
+<link rel="search" href="http://www.nature.com/search">
+<link rel="search" href="http://www.nature.com/opensearch/opensearch.xml" type="application/opensearchdescription+xml" title="nature.com">
+<link rel="search" href="http://www.nature.com/opensearch/request" type="application/sru+xml" title="nature.com">
+
+ <meta name="WT.cg_s" content="News"/>
+ <meta name="WT.z_cg_type" content="News"/>
+ <meta name="WT.page_categorisation" content="Article page"/>
+ <meta name="WT.z_subject_term" content="Publishing"/>
+
+<meta name="WT.template" content="oscar"/>
+<meta name="WT.cg_n" content="Nature"/>
+<meta name="dc.rights" content="©2020 Macmillan Publishers Limited. All Rights Reserved."/>
+<meta name="WT.z_bandiera_abtest" content="a"/>
+
+ <script data-test="dataLayer">
+ dataLayer = [{"content":{"category":{"contentType":"news","legacy":{"webtrendsPrimaryArticleType":"news","webtrendsSubjectTerms":"publishing","webtrendsContentCategory":null,"webtrendsContentCollection":null,"webtrendsContentGroup":"Nature","webtrendsContentGroupType":null,"webtrendsContentSubGroup":"News"}},"article":{"doi":"10.1038/d41586-020-02610-z"},"attributes":{"cms":"core media","deliveryPlatform":"oscar","copyright":{"open":false,"legacy":{"webtrendsLicenceType":null}}},"contentInfo":{"authors":["Diana Kwon"],"publishedAt":1599696000,"publishedAtString":"2020-09-10","title":"More than 100 scientific journals have disappeared from the Internet","legacy":null,"publishedAtTime":null,"documentType":"aplusplus"},"journal":{"pcode":"nature","title":"nature","volume":null,"issue":null},"authorization":{"status":true},"features":[{"name":"furtherReadingSection","present":false}],"collection":null},"page":{"category":{"pageType":"article"},"attributes":{"template":"magazine mosaic","featureFlags":[{"name":"ab_test_news_feature","active":false}]},"search":null},"privacy":{},"version":"1.0.0","product":null,"session":null,"user":null,"backHalfContent":false}];
+</script>
+
+<script>
+ (function() {
+ function deleteCookie (name, domain) {
+ document.cookie = encodeURIComponent(name) +
+ '=' +
+ ';path=/' +
+ ';domain=' + domain +
+ ';expires=Thu, 01 Jan 1970 00:00:00 GMT';
+ }
+
+ var consentCookieParts = ('; ' + document.cookie).split('; OptanonConsent=');
+
+ if (consentCookieParts.length > 1) {
+ consentCookieParts.shift(); // remove redundant first part from the split array
+
+ // onetrust can set the same cookie multiple times with different domain specificities
+ for (let i=0; i<consentCookieParts.length; i++) {
+ var otCookieGroups = consentCookieParts[i].split('&groups=').pop().split('&').shift();
+
+ if (otCookieGroups.indexOf('C0001') === -1) {
+ deleteCookie('OptanonConsent', 'nature.com');
+ deleteCookie('OptanonAlertBoxClosed', 'nature.com');
+ }
+ }
+ }
+ })();
+</script>
+
+<script>
+ (function(w,d,t) {
+ function cc() {
+ var h = w.location.hostname;
+ if (h.indexOf('preview-www.nature.com') > -1) return;
+
+ var e = d.createElement(t),
+ s = d.getElementsByTagName(t)[0];
+
+ if (h.indexOf('nature.com') > -1) {
+ e.src = 'https://cdn.cookielaw.org/scripttemplates/otSDKStub.js';
+ e.setAttribute('data-domain-script', '83f2c78a-6cbc-4d1a-9088-3f8e8c4c7460');
+ } else {
+ e.src = '/static/js/cookie-consent-bundle.9d49adbc02.js';
+ e.setAttribute('data-consent', h);
+ }
+ s.parentNode.insertBefore(e, s);
+ }
+
+ !!w.google_tag_manager ? cc() : window.addEventListener('gtm_loaded', function() {cc()});
+ })(window,document,'script');
+</script>
+<script>
+ function OptanonWrapper() {
+ window.dataLayer.push({event:'OneTrustGroupsUpdated'});
+ document.activeElement.blur();
+ }
+</script>
+
+
+<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
+ new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
+ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
+ 'https://www.googletagmanager.com/gtm.js?id='+i+dl;
+
+
+ j.addEventListener('load', function() {
+ var _ge = new CustomEvent('gtm_loaded', { bubbles: true });
+ d.dispatchEvent(_ge);
+ });
+
+ f.parentNode.insertBefore(j,f);
+})(window,document,'script','dataLayer','GTM-NWDMT9Q');</script>
+
+
+
+</head>
+<body>
+
+
+
+<div role="banner" class="position-relative cleared z-index-50 background-white" data-test="top-containers">
+
+
+ <a class="c-skip-link u-hide-print" href="#content">Skip to main content</a>
+
+
+
+
+
+
+
+ <aside class="c-ad c-ad--728x90">
+ <div class="c-ad__inner" data-container-type="banner-advert">
+ <p class="c-ad__label">Advertisement</p>
+
+
+
+ <div id="article-doubleclickad-container">
+ <div id="div-gpt-ad-top-1"
+ class="div-gpt-ad advert leaderboard js-ad text-center hide-print grade-c-hide"
+ data-ad-type="top"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="728x90"
+ data-gpt-targeting="type=article;pos=top;artid=d41586-020-02610-z;doi=10.1038/d41586-020-02610-z;subjmeta=479,648,706;kwrd=Publishing">
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing"
+ alt="Advertisement"
+ width="728"
+ height="90"></a>
+ </noscript>
+ </div>
+</div>
+
+
+
+
+ </div>
+ </aside>
+
+
+
+
+
+ <div class="c-grade-c-banner u-hide">
+ <div class="c-grade-c-banner__container">
+
+ <p>Thank you for visiting nature.com. You are using a browser version with limited support for CSS. To obtain
+ the best experience, we recommend you use a more up to date browser (or turn off compatibility mode in
+ Internet Explorer). In the meantime, to ensure continued support, we are displaying the site without styles
+ and JavaScript.</p>
+
+ </div>
+ </div>
+
+
+
+
+ <header class="c-header c-header--brand-border" id="header" data-header>
+ <div class="c-header__row-border">
+ <div class="c-header__container">
+ <div class="c-header__layout">
+ <a href="/nature"
+ data-track="click" data-track-action="home" data-track-category="nature-150-split-header" data-track-label="image">
+ <picture class="c-header__logo">
+ <source srcset="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" media="(min-width: 769px)">
+ <img src="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" alt="Nature">
+ </picture>
+ </a>
+ <div class="c-header__layout">
+
+ <div class="c-header__site-navigation c-header__site-navigation--show-at-md"
+ data-test="siteindex-link">
+ <a class="c-header__link" href="https://www.nature.com/siteindex"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open nature research index" data-track-label="link">
+ <span>View all Nature Research journals</span>
+ </a>
+ </div>
+
+ <div class="c-header__site-navigation c-header__site-navigation--border">
+ <a class="c-header__link"
+ href="#search-menu"
+ data-header-expander
+ data-test="search-link" data-track="click" data-track-category="nature-150-split-header" data-track-action="open search tray" data-track-label="button">
+ <span>Search</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M16.48 15.455c.283.282.29.749.007 1.032a.738.738 0 01-1.032-.007l-3.045-3.044a7 7 0 111.026-1.026zM8 14A6 6 0 108 2a6 6 0 000 12z"/></svg>
+ </a>
+ <a href="/nams/svc/myaccount"
+ id="my-account"
+ class="c-header__link placeholder"
+ data-test="login-link" data-track="click" data-track-action="my account" data-track-category="nature-150-split-header" data-track-label="link">
+
+
+ <div id="content" class="article-page position-relative z-index-1">
+ <section class="container highlight-container article-page--news container-with-gap">
+ <article class="article-item article-item--open" itemscope="" itemtype="http://schema.org/NewsArticle"
+ data-track-component="news">
+ <div class="container cleared container-type-article" data-container-type="article" itemprop="articleBody">
+ <div class="content position-relative cleared clear mq1200-padded" data-component="article-container"
+ role="main">
+ <header class="article-item__header clear cleared pull--both">
+ <div class="article__type">NEWS
+ <div class="ml10 article__date">
+ <time itemprop="datePublished">10 September 2020</time>
+ </div>
+ </div>
+
+ <div class="clear cleared"></div>
+ <h1 class="article-item__title serif" itemprop="headline">More than 100 scientific journals have disappeared from the Internet</h1>
+
+ <div class="article-item__teaser-text serif">
+ Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk.
+ </div>
+ </header>
+
+ <div class="clear cleared"></div>
+
+ <div class="bordered-container clear cleared pull--both">
+ <div id="author-affiliations" class="tab-group text14" role="tablist" data-test="author-affiliations" data-tab-group>
+ <div class="cleared">
+
+ <div id="author-affiliation-news-0" class="tab-box js-box-wrapper">
+ <h3 id="author-affiliation-news-0-head" data-track="click" data-track-label="view author info" class="sans-serif strong tab tab-skin ma0" role="tab"
+ aria-controls="author-affiliation-news-0-content" data-tooltip="Show author information">
+ Diana Kwon
+ </h3>
+ <div id="author-affiliation-news-0-content" class="tab-content pin-right grid grid-12 last"
+ role="tabpanel">
+ <div class="pa10" aria-labelledby="author-affiliation-news-0-head">
+ <div class="clear cleared">
+
+
+ <div class="align-left">
+ <h4 class="sans-serif">Search for this author in:</h4>
+ <ul class="ma0 clean-list">
+ <li class="strong"><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd&#x3D;search&amp;term&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Pub Med" >Pub Med</a></li>
+
+ <li class="strong"><a href="https://www.nature.com/search?order&#x3D;date_desc&amp;q&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Nature.com" >Nature.com</a></li>
+
+ <li class="strong"><a href="https://scholar.google.co.uk/scholar?as_q&#x3D;&amp;btnG&#x3D;Search+Scholar&amp;as_sauthors&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Google Scholar" >Google Scholar</a></li>
+ </ul>
+ </div>
+
+
+
+ </div>
+ </div>
+ </div>
+ </div>
+
+ </div>
+</div>
+
+ </div>
+
+ <div class="clear cleared pull--both">
+ <ul class="social clean-list inline-list hide-print">
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="twitter" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="https://twitter.com/intent/tweet?text=More+than+100+scientific+journals+have+disappeared+from+the+Internet&url=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Twitter</title>
+ <desc>Share on Twitter</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M20.8125,11.4875 C21.42,11.10375 21.8875,10.49625 22.105,9.7725 C21.5375,10.1275 20.90875,10.385 20.23875,10.5225 C19.70625,9.9225 18.9425,9.545 18.0975,9.545 C16.475,9.545 15.16,10.9325 15.16,12.6425 C15.16,12.885 15.185,13.1225 15.235,13.3475 C12.7975,13.2175 10.63125,11.985 9.1825,10.11 C8.93,10.56875 8.785,11.10125 8.785,11.66875 C8.785,12.74375 9.30375,13.69125 10.09125,14.2475 C9.61125,14.23125 9.1575,14.09 8.76125,13.86 L8.76125,13.8975 C8.76125,15.3975 9.77375,16.65125 11.11875,16.935 C10.87125,17.0075 10.6125,17.04375 10.34375,17.04375 C10.15625,17.04375 9.96875,17.025 9.79125,16.98875 C10.16625,18.22125 11.24875,19.11875 12.535,19.1425 C11.52875,19.97375 10.2625,20.4675 8.885,20.4675 C8.6475,20.4675 8.415,20.455 8.185,20.42625 C9.485,21.30375 11.02875,21.81625 12.6875,21.81625 C18.09,21.81625 21.04375,17.095 21.04375,13.00125 L21.03625,12.60125 C21.61125,12.16375 22.11125,11.6175 22.50125,10.99625 C21.97375,11.2425 21.4075,11.40875 20.81375,11.48375 L20.8125,11.4875 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="facebook" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Facebook</title>
+ <desc>Share on Facebook</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15.89625,22.8625 L12.57125,22.8625 L12.57125,15.02125 L10.90875,15.02125 L10.90875,12.31875 L12.57125,12.31875 L12.57125,10.69625 C12.57125,8.4925 13.50875,7.18 16.175,7.18 L18.39375,7.18 L18.39375,9.8825 L17.00625,9.8825 C15.96875,9.8825 15.9,10.26 15.9,10.965 L15.895,12.3175 L18.4075,12.3175 L18.115,15.02 L15.89625,15.02 L15.89625,22.8625 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="email" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="mailto:?subject=More than 100 scientific journals have disappeared from the Internet&body=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share via E-Mail</title>
+ <desc>Share via E-Mail</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15,15.3269887 L10.6248577,11.9177869 C10.4236021,11.7609644 10.1299323,11.7927468 9.96892789,11.988775 C9.80792343,12.1848031 9.84055341,12.4708451 10.041809,12.6276676 L14.7012493,16.2584003 C14.8680779,16.3940555 15.1152493,16.4013884 15.2915244,16.2640313 C15.2939898,16.2622325 15.2963784,16.2603294 15.2987507,16.2584003 L19.958191,12.6276676 C20.1594466,12.4708451 20.1920766,12.1848031 20.0310721,11.988775 C19.8700677,11.7927468 19.5763979,11.7609644 19.3751423,11.9177869 L15,15.3269887 Z M9,10 L21,10 C21.5522847,10 22,10.4477153 22,11 L22,19 C22,19.5522847 21.5522847,20 21,20 L9,20 C8.44771525,20 8,19.5522847 8,19 L8,11 C8,10.4477153 8.44771525,10 9,10 Z"></path>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+</ul>
+
+ </div>
+
+
+
+
+ <div class="align-left">
+
+ <div class="article__body serif cleared">
+ <p>Scholarly journals are supposed to provide a lasting record of science. But over the past two decades, 176 open-access journals — and many of the papers published in them — have disappeared from the Internet, according to an analysis published on 27 August<sup><a href="#ref-CR1" data-track="click" data-action="anchor-link" data-track-label="go to reference" data-track-category="references">1</a></sup>.</p><p>“There shouldn’t really be any decay or loss in scientific publications, particularly those that have been open on the web,” says Mikael Laakso, an information scientist at the Hanken School of Economics in Helsinki, and a co-author of the study, which was posted on the arXiv preprint server. He and his colleagues identified 176 titles whose online presence vanished between 2000 and 2019.</p>
+ <p>More than half of these journals were in the social sciences and humanities, although life sciences, health sciences, physical sciences and mathematics were also represented. Eighty-eight of the journals were affiliated with a scholarly society or a research institution. The analysis also identified 900 journals that are still online but seem to have stopped publishing papers, so might be vulnerable to vanishing in the near future.</p><p>The study lays out a “compelling case” for the vulnerability of online journals, says Elizabeth Lightfoot, a librarian at Florida International University in Miami.</p><h2>Vanishing journals</h2><p>Journals can disappear from the Internet for a number of reasons, says Laakso. The publisher might stop paying to keep its publication’s webpage afloat, for example, or journals might be hosted on an online platform that belongs to an academic institution and is left behind when the site or server is updated.</p><p>Journals are supposed to be preserved in digital archives when this happens. Services such as the LOCKSS (Lots of Copies Keep Stuff Safe) Program, which was launched by Stanford Libraries in 1999, aim to ensure that publications remain available even when the publisher is no longer around. LOCKSS works by making multiple copies of content that is stored on the servers of participating libraries, who pay an annual fee to have their collections preserved. Similar initiatives, including CLOCKSS, Portico and the Public Knowledge Project’s Preservation Network (PKP PN), have emerged over the past two decades. These vary in cost and coverage: some work with libraries, others with publishers — services such as PKP PN are free for journals that sign up. Tens of thousands of titles are currently curated in such preservation schemes. But, Laakso says, there are dozens of journals that fall through the cracks.</p>
+ <p>Pinning down whether a journal is truly unavailable online is a challenge, because there is no single database that tracks the activity of open-access journals, says Lisa Matthias, one of the authors of the study and a PhD student at the Free University of Berlin. Databases such as the Directory of Open Access Journals (DOAJ) don’t keep track of journals that no longer publish — and journals that cease publishing or stop maintaining their presence on the web usually do so silently.</p><p>To find out how many journals had vanished, the team manually collected historical data from several lists of titles, including the DOAJ, Ulrichsweb and Scopus. Then they checked to see if any of the titles they identified were listed on the Keepers Registry, which keeps track of journals that are enrolled in digital preservation schemes. Finally, they went to the Internet Archive’s Wayback Machine to access snapshots of now-offline journals’ websites to see when they had last published, and when the content was last available on the Internet. Journals were considered “vanished” if less than 50% of their content was still freely available online (the researchers acknowledge that some journals could exist in print form or behind a paywall).</p><p>The majority of the 176 vanished journals had disappeared within 5 years of becoming inactive — the point at which they stopped publishing papers. Around one-third of them disappeared within one year of the last publication. The researchers used this ‘life cycle’ to estimate that another 900 inactive open-access journals could be at risk of vanishing.</p><h2>Preserving the literature</h2><p>Subscription journals were not included in the study, Laakso says, because paywalls mean that they would have had to use a different method to collect the data. He adds that because of this and other limitations, the study probably underestimates the number of journals that have disappeared. “It’s really hard to pin down when something doesn’t absolutely exist, but we tried our best,” Laakso says. “We hope that there will be more refined and automatic ways to detect these in the future.”</p>
+ <p>Thib Guicherd-Callin, the acting manager of the LOCKSS Program, says it’s not surprising that there are journals that aren’t captured by existing preservation services. Although many groups have used the open-source LOCKSS software, efforts to launch digital preservation initiatives are still “woefully underfunded”, he adds. “The desire to preserve these at-risk works is there,” he adds, but few institutions are investing the resources necessary to identify these publications and make sure they’re included in a digital preservation scheme.</p><p>Matthias says that the responsibility for ensuring inactive journals don’t disappear should be shared between publishers, authors, librarians and preservation services. Lightfoot agrees that a coordinated and collaborative effort is necessary. However, she adds, “the twin challenges of what that effort might look like and who would fund it make the pathway forward murky at best”.</p>
+ </div>
+
+ <div class="emphasis">doi: <a href="https://doi.org/10.1038/d41586-020-02610-z">https://doi.org/10.1038/d41586-020-02610-z</a></div>
+ <div class="anchor-link mt40" data-toggle="anchor-links"></div>
+ <div id="references" class="references" data-toggle="anchor-links-section" data-label="References" data-concertina="true">
+ <section aria-labelledby="Bib1"><div class="serif article-section js-article-section cleared clear" id="Bib1-section"><h2 class="js-section-title section-title strong position-relative tighten-line-height background-gray-light pt20 pb6 pl0 pr20 standard-space-below small-space-above mq640-pt10 mq640-pb10 mq640-pl20 mq640-mt0 mq640-ml-20 mq640-mr-20 extend-left" id="Bib1">References</h2><div class="pl20 mq875-pl0 js-collapsible-section" id="Bib1-content"><div data-container-section="references"><ol class="clean-list ma0 standard-space-below indented-list" data-test="references-list"><li class="small-space-below border-gray-medium border-bottom-1 position-relative js-ref-item" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/Article" data-test="citation"><span class="indented-counter serif h2 tighten-line-height text-right position-absolute grade-c-hide">1.</span><p class="tiny-space-below" id="ref-CR1">Laakso, M., Matthias, L. &amp; Jahn, N. Preprint at <a href="https://arxiv.org/abs/2008.11933">https://arxiv.org/abs/2008.11933</a> (2020).</p><ul class="js-ref-links clean-list cleared strong sans-serif text13 hide-print small-space-below"><li class="pin-right"><ul class="clean-list ma0"></ul></li></ul></li></ol><p class="hide-print text-right"><a href="/articles/d41586-020-02610-z-references.ris" class="text14 sans-serif strong" data-track="click" data-track-action="download citation references" data-track-label="link">Download references</a></p></div></div></div></section>
+ </div>
+
+
+
+
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="inPage box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-inPage-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-inPage">
+ <input id="briefing-box-signup-form-inPage-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-inPage-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-inPage-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-inPage-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-inPage-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+
+
+ </div>
+
+ <aside class="article__aside align-right">
+ <div class="related-content shrink--aside hide-print">
+
+ <h3 class="aside__title sans-serif">Related Articles</h3>
+ <ul class="ma0 clean-list">
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-06178-7" data-track="click"
+ data-track-label="related article (rank:0)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ </noscript>
+
+ Radical open-access plan could spell end to journal subscriptions
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/investigating-journals-the-dark-side-of-publishing-1.12666" data-track="click"
+ data-track-label="related article (rank:1)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ </noscript>
+
+ Investigating journals: The dark side of publishing
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-020-01066-5" data-track="click"
+ data-track-label="related article (rank:2)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ </noscript>
+
+ Nature to join open-access Plan S, publisher says
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07557-w" data-track="click"
+ data-track-label="related article (rank:3)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ </noscript>
+
+ Funders flesh out details of Europe’s bold open-access plan
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07245-9" data-track="click"
+ data-track-label="related article (rank:4)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ </noscript>
+
+ AI peer reviewers unleashed to ease publishing grind
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/open-access-the-true-cost-of-science-publishing-1.12676" data-track="click"
+ data-track-label="related article (rank:5)">
+
+ The true cost of science publishing
+ </a>
+ </h3>
+ </li>
+
+ </ul>
+ </div>
+
+ <div class="article__subjects bordered-container shrink--aside hide-print">
+ <h3 class="aside__title sans-serif">Subjects</h3>
+ <ul class="ma0 subject-list cleared clean-list inline-list">
+
+ <li class="subject"><a href="/subjects/publishing" data-track="click"
+ data-track-label="subject (rank:0)">Publishing</a>
+ </li>
+
+ </ul>
+ </div>
+
+
+
+<div id="div-gpt-ad-right-2"
+ class="div-gpt-ad medium-rectangle advert js-ad text-center hide-print grade-c-hide"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="300x250"
+ data-gpt-targeting="pos=right;artid=/articles/d41586-020-02610-z;path=/articles/d41586-020-02610-z"
+ data-ad-type="right"
+ >
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z"
+ alt="Advertisement"
+ width="300"
+ height="250"/>
+ </a>
+ </noscript>
+</div>
+
+
+ <div class="nature-briefing--sidebar bordered-container shrink--aside hide-print">
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="sidebar box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Sign up to Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-sidebar-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-sidebar">
+ <input id="briefing-box-signup-form-sidebar-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-sidebar-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-sidebar-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-sidebar-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-sidebar-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+</div>
+
+ </aside>
+ </div>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="publisher" itemtype="https://schema.org/Organization">
+ <meta content="Macmillan Publishers Limited, part of Springer Nature" itemprop="name"/>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="author" itemtype="https://schema.org/Organization">
+ <meta content="Nature Editorial" itemprop="name"/>
+ </div>
+ <img src="/platform/track/article/d41586-020-02610-z" width="1" height="1" alt="" class="visually-hidden"/>
+</article>
+
+
+
+
+
+
+
+<div class="c-site-messages message hide u-hide-print c-site-messages--nature-briefing c-site-messages--nature-briefing-email-variant c-site-messages--nature-briefing-redesign-2020 sans-serif"
+data-component-id="nature-briefing-banner"
+data-component-expirydays="30"
+data-component-trigger-scroll-percentage="15"
+data-track="in-view"
+data-track-action="in-view"
+data-track-category="nature briefing"
+data-track-label="redesign banner visible">
+
+
+ <div class="c-site-messages__banner-large">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__form-container">
+
+
+
+ <div class="grid grid-12 last">
+ <div class="grid grid-4">
+ <img alt="Nature Briefing" src="/static/images/logos/nature-briefing-logo-n150-white.d81c9da3ec.svg" width="250" height="40">
+ <p class="c-site-messages--nature-briefing__strapline extra-tight-line-height">Sign up for the <em>Nature Briefing</em> newsletter — what matters in science, free to your inbox daily.</p>
+ </div>
+ <div class="grid grid-8 last">
+ <form action="/briefing/signup/formfeedback" method="post" data-location="banner" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-banner-signup-form-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBannerRedesign2020">
+ <input id="briefing-banner-signup-form-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBanner">
+ <label class="nature-briefing-banner__email-label" for="banner-EmailAddressInput">Email address</label>
+
+ <div class="nature-briefing-banner__email-wrapper">
+ <input class="nature-briefing-banner__email-input box-sizing text14" type="email" id="banner-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-emailbanner-email-input">
+ <button type="submit" class="nature-briefing-banner__submit-button box-sizing text14" data-test-element="briefing-emailbanner-signup-button">Sign up</button>
+ </div>
+
+ <div class="nature-briefing-banner__checkbox-wrapper grid grid-12 last">
+ <input class="nature-briefing-banner__checkbox-checkbox" id="gdpr-briefing-banner-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-emailbanner-gdpr-checkbox" required>
+ <label class="nature-briefing-banner__checkbox-label box-sizing text13 sans-serif block tighten-line-height" for="gdpr-briefing-banner-checkbox">I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+ </form>
+ </div>
+ </div>
+
+
+ </div>
+
+ </div>
+
+
+ <div class="c-site-messages__banner-small">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__content text14">
+ <span class="c-site-messages--nature-briefing__strapline strong serif">Get the most important science stories of the day, free in your inbox.</span>
+ <a class="nature-briefing__link text14 sans-serif"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner CTA to site"
+ data-test-element="briefing-banner-link"
+ target="_blank"
+ rel="noreferrer noopener"
+ href="/briefing/signup/?origin=Nature&amp;originReferralPoint=EmailBanner">Sign up for Nature Briefing
+ </a>
+ </div>
+
+ </div>
+
+</div>
+
+ </section>
+</div>
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Explore-our-content" data-test="Explore-our-content" id="explore" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Explore-our-content" class="c-header-expander__heading u-js-hide">Explore our content</h2>
+ <ul class="c-header-expander__list">
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/research"
+ data-track="click"
+ data-track-action="research"
+ data-track-label="link">
+ Research
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/news"
+ data-track="click"
+ data-track-action="news"
+ data-track-label="link">
+ News
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/opinion"
+ data-track="click"
+ data-track-action="opinion"
+ data-track-label="link">
+ Opinion
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/research-analysis"
+ data-track="click"
+ data-track-action="research analysis"
+ data-track-label="link">
+ Research Analysis
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/careers"
+ data-track="click"
+ data-track-action="careers"
+ data-track-label="link">
+ Careers
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/books-culture"
+ data-track="click"
+ data-track-action="books and culture"
+ data-track-label="link">
+ Books and Culture
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/podcast"
+ data-track="click"
+ data-track-action="podcasts"
+ data-track-label="link">
+ Podcasts
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/videoarchive"
+ data-track="click"
+ data-track-action="videos"
+ data-track-label="link">
+ Videos
+ </a>
+ </li>
+
+
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/current-issue"
+ data-track="click"
+ data-track-action="current issue"
+ data-track-label="link">
+ Current Issue
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-issues"
+ data-track="click"
+ data-track-action="browse issues"
+ data-track-label="link">
+ Browse Issues
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/articles"
+ data-track="click"
+ data-track-action="browse articles"
+ data-track-label="link">
+ Browse Articles
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/collections"
+ data-track="click"
+ data-track-action="browse collections"
+ data-track-label="link">
+ Browse Collections
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-subjects"
+ data-track="click"
+ data-track-action="browse subjects"
+ data-track-label="link">
+ Browse Subjects
+ </a>
+ </li>
+
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="https://www.nature.com/my-account/alerts/subscribe-journal?list-id&#x3D;1"
+ data-track="click"
+ data-track-action="Sign up for alerts"
+ data-track-label="link">Sign up for alerts<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m4 10h2.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-3.08578644l-1.12132034 1.1213203c-.18753638.1875364-.29289322.4418903-.29289322.7071068v.1715729h14v-.1715729c0-.2652165-.1053568-.5195704-.2928932-.7071068l-1.7071068-1.7071067v-3.4142136c0-2.76142375-2.2385763-5-5-5-2.76142375 0-5 2.23857625-5 5zm3 4c0 1.1045695.8954305 2 2 2s2-.8954305 2-2zm-5 0c-.55228475 0-1-.4477153-1-1v-.1715729c0-.530433.21071368-1.0391408.58578644-1.4142135l1.41421356-1.4142136v-3c0-3.3137085 2.6862915-6 6-6s6 2.6862915 6 6v3l1.4142136 1.4142136c.3750727.3750727.5857864.8837805.5857864 1.4142135v.1715729c0 .5522847-.4477153 1-1 1h-4c0 1.6568542-1.3431458 3-3 3-1.65685425 0-3-1.3431458-3-3z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Journal-information" id="journal-info" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Journal-information" class="c-header-expander__heading u-js-hide">Journal information</h2>
+ <ul class="c-header-expander__list">
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/about"
+ data-track="click"
+ data-track-action="about the journal"
+ data-track-label="link">
+ About the Journal
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-authors"
+ data-track="click"
+ data-track-action="for authors"
+ data-track-label="link">
+ For Authors
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-referees"
+ data-track="click"
+ data-track-action="for referees"
+ data-track-label="link">
+ For Referees
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/awards"
+ data-track="click"
+ data-track-action="awards"
+ data-track-label="link">
+ Awards
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/subscribe"
+ data-track="click"
+ data-track-action="subscribe"
+ data-track-label="link">
+ Subscribe
+ </a>
+ </li>
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="http://mts-nature.nature.com/"
+ data-track="click"
+ data-track-action="Submit manuscript"
+ data-track-label="link">Submit manuscript<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m15 0c1.1045695 0 2 .8954305 2 2v5.5c0 .27614237-.2238576.5-.5.5s-.5-.22385763-.5-.5v-5.5c0-.51283584-.3860402-.93550716-.8833789-.99327227l-.1166211-.00672773h-9v3c0 1.1045695-.8954305 2-2 2h-3v10c0 .5128358.38604019.9355072.88337887.9932723l.11662113.0067277h7.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-7.5c-1.1045695 0-2-.8954305-2-2v-10.17157288c0-.53043297.21071368-1.0391408.58578644-1.41421356l3.82842712-3.82842712c.37507276-.37507276.88378059-.58578644 1.41421356-.58578644zm-.5442863 8.18867991 3.3545404 3.35454039c.2508994.2508994.2538696.6596433.0035959.909917-.2429543.2429542-.6561449.2462671-.9065387-.0089489l-2.2609825-2.3045251.0010427 7.2231989c0 .3569916-.2898381.6371378-.6473715.6371378-.3470771 0-.6473715-.2852563-.6473715-.6371378l-.0010428-7.2231995-2.2611222 2.3046654c-.2531661.2580415-.6562868.2592444-.9065605.0089707-.24295423-.2429542-.24865597-.6576651.0036132-.9099343l3.3546673-3.35466731c.2509089-.25090888.6612706-.25227691.9135302-.00001728zm-.9557137-3.18867991c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5zm-8.5-3.587-3.587 3.587h2.587c.55228475 0 1-.44771525 1-1zm8.5 1.587c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+
+
+ <div id="search-menu" class="c-header-expander c-header-expander--tray u-hide-print" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <h2 class="u-visually-hidden">Search</h2>
+ <div data-test="inline-search">
+ <div class="c-header-expander__keyline u-mb-16">
+ <form action="/search"
+ method="get"
+ role="search"
+ class="c-header-expander__form"
+ autocomplete="off"
+ data-dynamic-track-label
+ data-track="submit" data-track-action="search" data-track-label="form">
+ <label class="c-header-expander__heading" for="keywords">Article Search</label>
+ <div class="c-form-field u-display-flex">
+ <input type="text"
+ class="c-form-field__input u-flex-shrink"
+ id="keywords"
+ name="q"
+ value=""
+ placeholder="Search by keywords or author"
+ data-test="search-keywords">
+ <button type="submit" class="c-button c-button--contrast u-flex-static u-ml-8" data-test="search-submit">Search</button>
+ </div>
+ <p class="u-ma-0">
+ <a href="/search/advanced"
+ data-track="click" data-track-action="advanced search" data-track-label="link">
+ Advanced search
+ </a>
+ </p>
+ </form>
+ </div>
+ <div class="c-header-expander__keyline">
+ <h3 class="c-header-expander__heading">Quick links</h3>
+ <ul class="u-list-reset">
+ <li class="u-display-inline-block u-mr-24"><a href="/subjects" data-track="click" data-track-action="explore articles by subject" data-track-label="link">Explore articles by subject</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/naturecareers" data-track="click" data-track-action="find a job" data-track-label="link">Find a job</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to authors</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+
+
+
+
+<footer role="contentinfo" class="composite-layer">
+ <div class="u-mt-16 u-mb-16">
+ <div class="u-container">
+ <div class="u-display-flex u-flex-wrap u-justify-content-space-between">
+ <p class="c-meta u-ma-0 u-mr-24">
+
+</p>
+
+ <p class="c-meta u-ma-0">
+ <span aria-level="2" class="c-meta__item" itemprop="name">
+ Nature
+ </span>
+ <span class="c-meta__item">
+ <abbr title="International Standard Serial Number">ISSN</abbr> <span itemprop="issn">1476-4687</span> (online)
+ </span>
+ </p>
+ </div>
+ </div>
+</div>
+
+
+ <div itemscope itemtype="http://schema.org/Periodical">
+ <meta itemprop="publisher" content="Springer Nature">
+ <div class="c-footer">
+ <div class="u-container">
+ <div class="u-hide-print" data-track-component="footer">
+ <h2 aria-level="2" class="u-visually-hidden">nature.com sitemap</h2>
+ <div class="c-footer__header">
+ <div class="c-footer__logo">
+ <img alt="Nature Research" src="/static/images/logos/nature research-white-150.f4acf77e0c.svg" loading="lazy" width="200" height="26">
+ </div>
+ <ul class="c-menu c-menu--inherit u-mr-32">
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/company_info/index.html" data-track="click" data-track-action="about us" data-track-label="link">About us</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/press_room/press_releases.html" data-track="click" data-track-action="press releases" data-track-label="link">Press releases</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://press.nature.com/" data-track="click" data-track-action="press office" data-track-label="link">Press office</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://support.nature.com/support/home" data-track="click" data-track-action="contact us" data-track-label="link">Contact us</a></li>
+ </ul>
+ <ul class="c-menu c-menu--inherit">
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.facebook.com/nature/" aria-label="Nature on Facebook" data-track="click" data-track-action="facebook" data-track-label="link">
+ <svg class="u-icon u-mt-2 u-mb-2" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 20 20"><path d="M2.5 20C1.1 20 0 18.9 0 17.5v-15C0 1.1 1.1 0 2.5 0h15C18.9 0 20 1.1 20 2.5v15c0 1.4-1.1 2.5-2.5 2.5h-3.7v-7.7h2.6l.4-3h-3v-2c0-.9.2-1.5 1.5-1.5h1.6V3.1c-.3 0-1.2-.1-2.3-.1-2.3 0-3.9 1.4-3.9 4v2.2H8.1v3h2.6V20H2.5z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://twitter.com/nresearchnews?lang=en" aria-label="Nature on Twitter" data-track="click" data-track-action="twitter" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M17.6 4.1c.8-.5 1.5-1.4 1.8-2.4-.8.5-1.7.9-2.6 1-.7-.8-1.8-1.4-3-1.4-2.3 0-4.1 1.9-4.1 4.3 0 .3 0 .7.1 1-3.4 0-6.4-1.8-8.4-4.4C1 2.9.8 3.6.8 4.4c0 1.5.7 2.8 1.8 3.6C2 8 1.4 7.8.8 7.5v.1c0 2.1 1.4 3.8 3.3 4.2-.3.1-.7.2-1.1.2-.3 0-.5 0-.8-.1.5 1.7 2 3 3.8 3-1.3 1.1-3.1 1.8-5 1.8-.3 0-.7 0-1-.1 1.8 1.2 4 1.9 6.3 1.9C13.8 18.6 18 12 18 6.3v-.6c.8-.6 1.5-1.4 2-2.2-.7.3-1.5.5-2.4.6z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.youtube.com/channel/UCvCLdSgYdSTpWcOgEJgi-ng" aria-label="Nature on YouTube" data-track="click" data-track-action="youtube" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M7.9 12.6V6.9l5.4 2.8c0 .1-5.4 2.9-5.4 2.9zM19.8 6s-.2-1.4-.8-2c-.8-.8-1.6-.8-2-.9-2.8-.2-7-.2-7-.2s-4.2 0-7 .2c-.4 0-1.2 0-2 .9-.6.6-.8 2-.8 2S0 7.6 0 9.2v1.5c0 1.7.2 3.3.2 3.3s.2 1.4.8 2c.8.8 1.8.8 2.2.9 1.6.1 6.8.2 6.8.2s4.2 0 7-.2c.4 0 1.2-.1 2-.9.6-.6.8-2 .8-2s.2-1.6.2-3.3V9.2c0-1.6-.2-3.2-.2-3.2z"/></svg>
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="c-footer__grid">
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Discover content</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/siteindex" data-track="click" data-track-action="journals a-z" data-track-label="link">Journals A-Z</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/subjects/" data-track="click" data-track-action="article by subject" data-track-label="link">Articles by subject</a></li>
+ <li class="c-footer__item"><a href="https://nano.nature.com/" data-track="click" data-track-action="nano" data-track-label="link">Nano</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/protocolexchange/" data-track="click" data-track-action="protocol exchange" data-track-label="link">Protocol Exchange</a></li>
+ <li class="c-footer__item"><a href="https://www.natureindex.com/" data-track="click" data-track-action="nature index" data-track-label="link">Nature Index</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Publish with us</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/author_resources/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to Authors</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/peer_review/" data-track="click" data-track-action="guide to referees" data-track-label="link">Guide to Referees</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/publishing-with-npg/" data-track="click" data-track-action="open access" data-track-label="link">Open access</a></li>
+ <li ><a href="https://www.nature.com/reprints/" data-track="click" data-track-action="reprints and permissions" data-track-label="link">Reprints &amp; permissions</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Researcher services</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/authors/research-data" data-track="click" data-track-action="data research service" data-track-label="link">Research data</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/go/nr" data-track="click" data-track-action="language editing" data-track-label="link">Language editing</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/scientific-editing/" data-track="click" data-track-action="scientific editing" data-track-label="link">Scientific editing</a></li>
+ <li class="c-footer__item"><a href="https://masterclasses.nature.com/" data-track="click" data-track-action="nature masterclasses" data-track-label="link">Nature Masterclasses</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/researcher-training/" data-track="click" data-track-action="nature research academies" data-track-label="link">Nature Research Academies</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Libraries &amp; institutions</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/tools-services" data-track="click" data-track-action="librarian service and tools" data-track-label="link">Librarian service &amp; tools</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/manage-your-account/librarianportal" data-track="click" data-track-action="librarian portal" data-track-label="link">Librarian portal</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/about-open-access/information-for-institutions/" data-track="click" data-track-action="open research" data-track-label="link">Open research</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Advertising &amp; partnerships</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/digital-advertising/" data-track="click" data-track-action="advertising" data-track-label="link">Advertising</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/" data-track="click" data-track-action="partnerships and services" data-track-label="link">Partnerships &amp; Services</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/media-kits/" data-track="click" data-track-action="media kits" data-track-label="link">Media kits</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/branded-content-native-advertising/" data-track-action="branded content" data-track-label="link">Branded content</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Career development</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/naturecareers" data-track="click" data-track-action="nature careers" data-track-label="link">Nature Careers</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureconferences/" data-track="click" data-track-action="nature conferences" data-track-label="link">Nature<span class="visually-hidden"> </span> Conferences</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureevents/" data-track="click" data-track-action="nature events" data-track-label="link">Nature<span class="visually-hidden"> </span> events</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Regional websites</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="http://www.naturechina.com" data-track="click" data-track-action="nature china" data-track-label="link">Nature China</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nindia" data-track="click" data-track-action="nature india" data-track-label="link">Nature India</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ja-jp/" data-track="click" data-track-action="nature japan" data-track-label="link">Nature Japan</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ko-kr/" data-track="click" data-track-action="nature korea" data-track-label="link">Nature Korea</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nmiddleeast/" data-track="click" data-track-action="nature middle east" data-track-label="link">Nature Middle East</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Legal &amp; Privacy</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/info/privacy.html" data-track="click" data-track-action="privacy policy" data-track-label="link">Privacy Policy</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/cookies.html" data-track="click" data-track-action="use of cookies" data-track-label="link">Use of cookies</a></li>
+ <li class="c-footer__item"><a class="optanon-toggle-display" href="javascript:;" data-track="click" data-track-action="manage cookies" data-track-label="link">Manage cookies/Do not sell my data</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/legal_notice.html" data-track="click" data-track-action="legal notice" data-track-label="link">Legal notice</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/accessibility_statement.html" data-track="click" data-track-action="accessibility statement" data-track-label="link">Accessibility statement</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/tandc.html" data-track="click" data-track-action="terms and conditions" data-track-label="link">Terms &amp; Conditions</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/ccpa" data-track="click" data-track-action="california privacy statement" data-track-label="link">California Privacy Statement</a></li>
+ </ul>
+ </div>
+ </div>
+</div>
+
+
+ </div>
+ </div>
+ </div>
+
+ <div class="c-corporate-footer">
+ <div class="u-container">
+ <img src="/static/images/logos/sn-logo-white.ea63208b81.svg" alt="Springer Nature" loading="lazy" width="140" height="14"/>
+ <p class="c-corporate-footer__legal" data-test="copyright">&copy; 2020 Springer Nature Limited</p>
+ </div>
+</div>
+
+
+ <svg class="u-hide hide">
+ <symbol id="global-icon-chevron-right" viewBox="0 0 16 16">
+ <path d="M7.782 7L5.3 4.518c-.393-.392-.4-1.022-.02-1.403a1.001 1.001 0 011.417 0l4.176 4.177a1.001 1.001 0 010 1.416l-4.176 4.177a.991.991 0 01-1.4.016 1 1 0 01.003-1.42L7.782 9l1.013-.998z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-download" viewBox="0 0 16 16">
+ <path d="M2 14c0-.556.449-1 1.002-1h9.996a.999.999 0 110 2H3.002A1.006 1.006 0 012 14zM9 2v6.8l2.482-2.482c.392-.392 1.022-.4 1.403-.02a1.001 1.001 0 010 1.417l-4.177 4.177a1.001 1.001 0 01-1.416 0L3.115 7.715a.991.991 0 01-.016-1.4 1 1 0 011.42.003L7 8.8V2c0-.55.444-.996 1-.996.552 0 1 .445 1 .996z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-email" viewBox="0 0 18 18">
+ <path d="M1.995 2h14.01A2 2 0 0118 4.006v9.988A2 2 0 0116.005 16H1.995A2 2 0 010 13.994V4.006A2 2 0 011.995 2zM1 13.994A1 1 0 001.995 15h14.01A1 1 0 0017 13.994V4.006A1 1 0 0016.005 3H1.995A1 1 0 001 4.006zM9 11L2 7V5.557l7 4 7-4V7z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-institution" viewBox="0 0 18 18">
+ <path d="M14 8a1 1 0 011 1v6h1.5a.5.5 0 01.5.5v.5h.5a.5.5 0 01.5.5V18H0v-1.5a.5.5 0 01.5-.5H1v-.5a.5.5 0 01.5-.5H3V9a1 1 0 112 0v6h8V9a1 1 0 011-1zM6 8l2 1v4l-2 1zm6 0v6l-2-1V9zM9.573.401l7.036 4.925A.92.92 0 0116.081 7H1.92a.92.92 0 01-.528-1.674L8.427.401a1 1 0 011.146 0zM9 2.441L5.345 5h7.31z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-search" viewBox="0 0 22 22">
+ <path fill-rule="evenodd" d="M21.697 20.261a1.028 1.028 0 01.01 1.448 1.034 1.034 0 01-1.448-.01l-4.267-4.267A9.812 9.811 0 010 9.812a9.812 9.811 0 1117.43 6.182zM9.812 18.222A8.41 8.41 0 109.81 1.403a8.41 8.41 0 000 16.82z"/>
+ </symbol>
+ <symbol id="global-icon-info" viewBox="0 0 18 18">
+ <path d="m9 0c4.9705627 0 9 4.02943725 9 9 0 4.9705627-4.0294373 9-9 9-4.97056275 0-9-4.0294373-9-9 0-4.97056275 4.02943725-9 9-9zm0 7h-1.5l-.11662113.00672773c-.49733868.05776511-.88337887.48043643-.88337887.99327227 0 .47338693.32893365.86994729.77070917.97358929l.1126697.01968298.11662113.00672773h.5v3h-.5l-.11662113.0067277c-.42082504.0488782-.76196299.3590206-.85696816.7639815l-.01968298.1126697-.00672773.1166211.00672773.1166211c.04887817.4208251.35902055.761963.76398144.8569682l.1126697.019683.11662113.0067277h3l.1166211-.0067277c.4973387-.0577651.8833789-.4804365.8833789-.9932723 0-.4733869-.3289337-.8699473-.7707092-.9735893l-.1126697-.019683-.1166211-.0067277h-.5v-4l-.00672773-.11662113c-.04887817-.42082504-.35902055-.76196299-.76398144-.85696816l-.1126697-.01968298zm0-3.25c-.69035594 0-1.25.55964406-1.25 1.25s.55964406 1.25 1.25 1.25 1.25-.55964406 1.25-1.25-.55964406-1.25-1.25-1.25z" fill-rule="evenodd"/>
+ </symbol>
+ </svg>
+
+</footer>
+
+
+</body>
+</html>
+
diff --git a/python/tests/files/peerj_oa_article.html b/python/tests/files/peerj_oa_article.html
new file mode 100644
index 0000000..f2cf365
--- /dev/null
+++ b/python/tests/files/peerj_oa_article.html
@@ -0,0 +1,2365 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+ <meta charset="utf-8">
+
+ <title>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles [PeerJ]</title>
+
+
+ <link rel="dns-prefetch" href="https://d2pdyyx74uypu5.cloudfront.net/">
+ <link rel="dns-prefetch" href="http://static.peerj.com/">
+<link rel="dns-prefetch" href="https://doi.org">
+
+
+ <meta name="citation_title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"><meta name="citation_date" content="2018-02-13"><meta name="citation_doi" content="10.7717/peerj.4375"><meta name="citation_language" content="en"><meta name="citation_pdf_url" content="https://peerj.com/articles/4375.pdf"><meta name="citation_fulltext_html_url" content="https://peerj.com/articles/4375"><meta name="citation_volume" content="6"><meta name="citation_firstpage" content="e4375"><meta name="citation_keywords" content="Open access; Open science; Scientometrics; Publishing; Libraries; Scholarly communication; Bibliometrics; Science policy"><meta name="citation_journal_title" content="PeerJ"><meta name="citation_journal_abbrev" content="PeerJ"><meta name="citation_publisher" content="PeerJ Inc."><meta name="citation_issn" content="2167-8359"><meta name="citation_author" content="Heather Piwowar"><meta name="citation_author_institution" content="Impactstory, Sanford, NC, USA"><meta name="citation_author_email" content="heather@impactstory.org"><meta name="citation_author" content="Jason Priem"><meta name="citation_author_institution" content="Impactstory, Sanford, NC, USA"><meta name="citation_author_email" content="jason@impactstory.org"><meta name="citation_author" content="Vincent Larivière"><meta name="citation_author_institution" content="École de bibliothéconomie et des sciences de l’information, Université de Montréal, Montréal, QC, Canada"><meta name="citation_author_institution" content="Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal, Montréal, QC, Canada"><meta name="citation_author" content="Juan Pablo Alperin"><meta name="citation_author_institution" content="Canadian Institute for Studies in Publishing, Simon Fraser University, Vancouver, BC, Canada"><meta name="citation_author_institution" content="Public Knowledge Project, Canada"><meta name="citation_author" content="Lisa Matthias"><meta name="citation_author_institution" content="Scholarly Communications Lab, Simon Fraser University, Vancouver, Canada"><meta name="citation_author" content="Bree Norlander"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author_institution" content="FlourishOA, USA"><meta name="citation_author" content="Ashley Farley"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author_institution" content="FlourishOA, USA"><meta name="citation_author" content="Jevin West"><meta name="citation_author_institution" content="Information School, University of Washington, Seattle, USA"><meta name="citation_author" content="Stefanie Haustein"><meta name="citation_author_institution" content="Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal, Montréal, QC, Canada"><meta name="citation_author_institution" content="School of Information Studies, University of Ottawa, Ottawa, ON, Canada">
+ <meta name="description" content="Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.">
+
+
+ <meta property="og:image" content="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg">
+ <meta name="twitter:image" content="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg">
+
+ <meta name="twitter:card" content="summary_large_image">
+ <meta name="twitter:url" content="https://peerj.com/articles/4375">
+ <meta name="twitter:site" content="@thePeerJ">
+ <meta name="twitter:title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles">
+ <meta name="twitter:description" content="Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.">
+
+ <meta property="og:type" content="article">
+ <meta property="og:title" content="The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles">
+ <meta property="og:url" content="https://peerj.com/articles/4375">
+ <meta property="og:site_name" content="PeerJ">
+
+
+ <link rel="alternate" type="application/pdf" href="/articles/4375.pdf">
+ <link rel="alternate" type="application/rdf+xml" href="/articles/4375.rdf">
+ <link rel="alternate" type="application/json" href="/articles/4375.json">
+ <link rel="alternate" type="application/xml" href="/articles/4375.xml">
+ <link rel="alternate" type="application/unixref+xml" href="/articles/4375.unixref">
+ <link rel="alternate" type="application/vnd.citationstyles.csl+json" href="/articles/4375.citeproc">
+ <link rel="alternate" type="application/bibjson+json" href="/articles/4375.bibjson">
+ <link rel="alternate" type="text/html" href="/articles/4375.html">
+
+ <link rel="canonical" href="https://peerj.com/articles/4375/">
+
+ <meta name="viewport" content="width=device-width,initial-scale=1">
+ <meta property="fb:app_id" content="534542813234464">
+
+ <link rel="stylesheet" href="/css/05b9c3d-27443c7.css" media="screen">
+
+<!--[if lt IE 9]>
+ <link rel="stylesheet" href="/assets/css/ie8.css" media="screen">
+<![endif]-->
+
+<!--[if lt IE 10]>
+ <link rel="stylesheet" href="/assets/css/ie9.css" media="screen">
+<![endif]-->
+
+ <style media="screen">html, body { height: 100%; }</style>
+ <link rel="stylesheet" href="https://cdn.peerj.com/webpack/vue-bundle.2cdd25e1.css">
+
+
+ <link rel="stylesheet" href="/css/a0c1a2c-04690d8.css" media="screen">
+
+ <link rel="stylesheet" href="/css/be477b9-1134171.css" media="screen">
+ <link rel="stylesheet" href="/css/3e4ba6d-c134b5f.css" media="print">
+ <script src="/js/36e5d51-2d7025c.js"></script>
+<script src="/assets/js/polyfills/includes.js"></script>
+<script src="/assets/js/polyfills/startsWith.js"></script><!--[if lt IE 9]>
+<script src="/assets/js/html5shiv.js"></script>
+
+<![endif]-->
+
+<!--[if lt IE 8]>
+<script src="/assets/js/json2.js"></script>
+<![endif]-->
+
+<script>
+ var PeerJ = {
+ Article: {},
+ User: {
+ anonymous: true },
+ Publication: {},
+ Production: {},
+ Event: {},
+ Com: {},
+ Payment: {},
+ Annotation: {},
+ Search: {},
+ Home: {},
+ Subjects: {},
+ Advocacy: {},
+ Job: {},
+ ContentAlert: {},
+ Tools: {}
+ };
+</script>
+
+
+<script>
+ var campaign_keywords = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term'];
+ var kw = '';
+ var lastUtms = {};
+ var firstUtms = {};
+ var allUtms = {};
+
+ function campaignParams() {
+ var index;
+ for (index = 0; index < campaign_keywords.length; ++index) {
+ kw = getQueryParam(document.URL, campaign_keywords[index]);
+ if (kw.length) {
+ lastUtms[campaign_keywords[index] + '-last'] = kw;
+ firstUtms[campaign_keywords[index] + '-first'] = kw;
+ allUtms[campaign_keywords[index] + '-all'] = kw;
+ }
+ }
+ }
+
+ function updatePreregCookie(preregCookie, firstUtmKey) {
+ var utmVal = firstUtms[firstUtmKey];
+ if (utmVal) {
+ var existingPreregCampaign = $.cookie(preregCookie);
+ var appendPreregCampaign;
+ if (!existingPreregCampaign) {
+ appendPreregCampaign = utmVal;
+ } else {
+ appendPreregCampaign = existingPreregCampaign + ',' + utmVal;
+
+ }
+ $.cookie(preregCookie, appendPreregCampaign, {expires: 365, path: "/"});
+ }
+ }
+
+ function getQueryParam(url, param) {
+ // Expects a raw URL
+ param = param.replace(/[[]/, "\[").replace(/[]]/, "\]");
+ var regexS = "[\?&]" + param + "=([^&#]*)",
+ regex = new RegExp( regexS ),
+ results = regex.exec(url);
+ if (results === null || (results && typeof(results[1]) !== 'string' && results[1].length)) {
+ return '';
+ } else {
+ return decodeURIComponent(results[1]).replace(/\W/gi, ' ');
+ }
+ }
+
+ function articlePageEvent() {
+ var articleContainer = $('.publication-jsondata');
+ if (articleContainer.length) {
+ var data = articleContainer.data('publication-meta');
+
+ // Must be public
+ if (data.publicationSubjects.length) {
+
+ var eventName = 'Viewed-article';
+ var preprint = data.preprint;
+ if (preprint) {
+ eventName = 'Viewed-preprint';
+ }
+
+ data['ip-hash'] = 'bf3914b8088a79fb1fcf39cb526631c0';
+ mixpanel.track(eventName, data);
+ }
+ }
+ }
+
+ function sectionListViewEvent() {
+ }
+</script>
+ <script>
+ // User agrees to terms on signup, so Mixpanel is OK
+ // On submit, update mixpanel distinct id
+ setTimeout(function () {
+ var regmixpanel = document.getElementById('fos_user_registration_form_mixpanelId');
+ if (regmixpanel) {
+ var distinctId = $.cookie('pj_mp_distinct');
+ if (!distinctId) {
+ distinctId = mixpanel.get_distinct_id();
+ }
+ console.log(distinctId);
+ regmixpanel.value = distinctId;
+ }
+ }, 1500);
+
+ // If logged out then check if consented to analytics cookies (if applicable to country)
+ // Run through cookieConsent only
+ PeerJ.Com.Mixpanel = new function() {
+ this.leadView = function() {
+ mixpanel.init('776a79e14e8f05a81ca92536c83f08b4', {
+ 'secure_cookie': true,
+ loaded: function (mixpanel) {
+ setTimeout(function () {
+ articlePageEvent();
+
+ sectionListViewEvent();
+
+
+
+ }, 1000);
+ }
+ });
+ }
+ };
+
+ campaignParams();
+ updatePreregCookie('pj_prereg_campaign', 'utm_campaign-first');
+ updatePreregCookie('pj_prereg_content', 'utm_content-first');
+ updatePreregCookie('pj_prereg_term', 'utm_term-first');
+ </script>
+
+
+
+ <script>(function(p,u,s,h,x){p.pushpad=p.pushpad||function(){(p.pushpad.q=p.pushpad.q||[]).push(arguments)};h=u.getElementsByTagName('head')[0];x=u.createElement('script');x.async=1;x.src=s;h.appendChild(x);})(window,document,'https://pushpad.xyz/pushpad.js');
+pushpad('init', 5977, {hostname: 'peerj.com'});
+</script>
+
+ <link rel="search" type="application/opensearchdescription+xml" href="https://peerj.com/articles/osd.xml" title="PeerJ">
+
+
+
+
+
+ <script>
+ // Run through cookieConsent only
+ PeerJ.Com.GA = new function() {
+ this.disabletracking = function() {
+ window['ga-disable-' + 'UA-31208920-1'] = true;
+ };
+
+ this.runGA = function() {
+ (function (i, s, o, g, r, a, m) {
+ i['GoogleAnalyticsObject'] = r;
+ i[r] = i[r] || function () {
+ (i[r].q = i[r].q || []).push(arguments)
+ }, i[r].l = 1 * new Date();
+ a = s.createElement(o),
+ m = s.getElementsByTagName(o)[0];
+ a.async = 1;
+ a.src = g;
+ m.parentNode.insertBefore(a, m)
+ })(window, document, 'script', 'https://www.google-analytics.com/analytics.js', 'ga');
+
+ ga('create', 'UA\u002D31208920\u002D1', 'auto');
+
+ // Removes last octet
+ ga('set', 'anonymizeIp', true);
+
+
+
+
+
+
+
+
+
+ ga('set', 'dimension4', ';Legal\u0020Issues\u003BScience\u0020Policy\u003BData\u0020Science;');
+
+ ga('require', 'displayfeatures');
+
+ ga('send', 'pageview');
+
+ window.setTimeout(function () {
+ ga('send', 'event', 'adjusted bounce rate', 'page visit 15 seconds or more');
+ }, 15000);
+
+
+ }
+ };
+ </script>
+ <script src="/js/8548491-f0f5b7c.js"></script>
+
+<link rel="apple-touch-icon" sizes="57x57" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-57x57.png">
+<link rel="apple-touch-icon" sizes="60x60" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-60x60.png">
+<link rel="apple-touch-icon" sizes="72x72" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-72x72.png">
+<link rel="apple-touch-icon" sizes="76x76" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-76x76.png">
+<link rel="apple-touch-icon" sizes="114x114" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-114x114.png">
+<link rel="apple-touch-icon" sizes="120x120" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-120x120.png">
+<link rel="apple-touch-icon" sizes="144x144" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-144x144.png">
+<link rel="apple-touch-icon" sizes="152x152" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-152x152.png">
+<link rel="apple-touch-icon" sizes="180x180" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/apple-icon-180x180.png">
+<link rel="icon" type="image/png" sizes="192x192" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/android-icon-192x192.png">
+<link rel="shortcut icon" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon.ico">
+<link rel="icon" type="image/png" sizes="32x32" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-32x32.png">
+<link rel="icon" type="image/png" sizes="96x96" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-96x96.png">
+<link rel="icon" type="image/png" sizes="16x16" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/favicon-16x16.png">
+<link rel="manifest" href="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/manifest.json">
+<meta name="msapplication-TileColor" content="#ffffff">
+<meta name="msapplication-TileImage" content="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/ms-icon-144x144.png">
+<meta name="msapplication-config" content="https://d2pdyyx74uypu5.cloudfront.net/images/favicon/peerj/browserconfig.xml">
+<meta name="theme-color" content="#ffffff"></head>
+
+<body class="">
+
+ <!-- FreshDesk variable (TODO: move elsewhere) -->
+
+
+<nav class="navbar navbar-fixed-top navbar-inverse navbar-alpha" role="navigation"><div class="navbar-inner"><!-- .btn-navbar is used as the toggle for collapsed navbar content --><a class="btn btn-navbar pull-right" data-toggle="collapse" data-target=".nav-collapse"><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></a><!-- logo --><ul class="nav pull-left nav-sections nav-journal"><li class="dropdown"><a href="/" class="dropdown-toggle "
+ data-toggle="dropdown"><span id="navJournalTitle">PeerJ Journals</span><b class="caret"></b></a><ul class="dropdown-menu journal-list"><li><a href="/">PeerJ Publishing Overview</a></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">PeerJ – Life & Environment</a><ul class="dropdown-menu"><li><a href="/sections/">About the journal Sections</a></li><li class="divider"></li><li><a href="/sections/aquatic-biology/">Aquatic Biology</a></li><li><a href="/sections/biochemistry-biophysics-molecular-biology/">Biochemistry, Biophysics and Molecular Biology</a></li><li><a href="/sections/biodiversity-conservation/">Biodiversity and Conservation</a></li><li><a href="/sections/bioinformatics-genomics/">Bioinformatics and Genomics</a></li><li><a href="/sections/brain-cognition/">Brain and Cognition</a></li><li><a href="/sections/ecology/">Ecology</a></li><li><a href="/sections/environ-sci/">Environmental Science</a></li><li><a href="/sections/microbiology/">Microbiology</a></li><li><a href="/sections/paleontology-evolutionary-science/">Paleontology and Evolutionary Science</a></li><li><a href="/sections/plant-biology/">Plant Biology</a></li><li><a href="/sections/zoological-science/">Zoological Science</a></li></ul></li><li><a href="/computer-science/">
+ PeerJ Computer Science
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Physical Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Organic Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Inorganic Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Analytical Chemistry
+ </a></li><li><a href="https://peerj.com/chemistry/">
+ PeerJ Materials Science
+ </a></li><li class="divider"></li><li><a href="https://peerj.org/" target="_blank">Visit PeerJ.org and get involved</a></li></ul></li></ul><!-- mobile-only top nav items --><ul class="nav pull-left nav-about-phone hidden-desktop"><li class="dropdown"><a tabindex="-1" href="#" class="dropdown-toggle"
+ data-toggle="dropdown">About <b class="caret"></b></a><ul class="dropdown-menu"><li id="about-overview"><a href="/benefits/">PeerJ Journals Overview</a></li><li id="about-faq"><a href="/about/FAQ/">PeerJ Journals FAQ</a></li><li id="about-what-publish"><a href="/about/publications/">What we publish</a></li><li id="8yrs-publishing"><a href="/benefits/peerj-timeline/">8 Years publishing</a></li><li class="divider"></li><li role="presentation" class="dropdown-header">Solutions for authors</li><li id="about-reputation"><a href="/benefits/reputation/">Reputation</a></li><li id="about-peer-review"><a href="/benefits/peer-review-timeline/">High quality peer review</a></li><li id="about-speed"><a href="/benefits/fast-publishing/">Fast publishing</a></li><li id="about-impact"><a href="/benefits/indexing-and-impact-factor/">Indexing and Impact Factor</a></li><li id="about-readership"><a href="/benefits/broad-audience/">Global readership</a></li><li id="about-features"><a href="/benefits/peerj-feature-comparison/">Feature comparison</a></li><li id="about-cost"><a href="/benefits/reduced-cost-publishing/">Reduced cost publishing</a></li><li id="about-feedback"><a href="/benefits/feedback/">Author feedback</a></li><li id="about-ecr-benefits"><a href="/benefits/early-career-researchers/">Early career researcher benefits</a></li><li id="about-senior-researcher-benefits"><a href="/benefits/senior-researchers/">Senior researcher benefits</a></li><li id="about-open-review"><a href="/benefits/review-history-and-peer-review/">Open review (optional)</a></li><li id="about-rebuttal"><a href="/benefits/academic-rebuttal-letters/">Rebuttal letters</a></li></ul></li><li><!-- checkout items --></li><li><!-- notifications --></li></ul><!-- sections --><ul class="nav pull-left nav-collapse nav-sections nav-sections-main collapse search-hide"><li class="dropdown visible-desktop"><a tabindex="-1" href="#" class="dropdown-toggle"
+ data-toggle="dropdown">About <b class="caret"></b></a><ul class="dropdown-menu"><li id="about-overview"><a href="/benefits/">PeerJ Journals Overview</a></li><li id="about-faq"><a href="/about/FAQ/">PeerJ Journals FAQ</a></li><li id="about-what-publish"><a href="/about/publications/">What we publish</a></li><li id="8yrs-publishing"><a href="/benefits/peerj-timeline/">8 Years publishing</a></li><li class="divider"></li><li role="presentation" class="dropdown-header">Solutions for authors</li><li id="about-reputation"><a href="/benefits/reputation/">Reputation</a></li><li id="about-peer-review"><a href="/benefits/peer-review-timeline/">High quality peer review</a></li><li id="about-speed"><a href="/benefits/fast-publishing/">Fast publishing</a></li><li id="about-impact"><a href="/benefits/indexing-and-impact-factor/">Indexing and Impact Factor</a></li><li id="about-readership"><a href="/benefits/broad-audience/">Global readership</a></li><li id="about-features"><a href="/benefits/peerj-feature-comparison/">Feature comparison</a></li><li id="about-cost"><a href="/benefits/reduced-cost-publishing/">Reduced cost publishing</a></li><li id="about-feedback"><a href="/benefits/feedback/">Author feedback</a></li><li id="about-ecr-benefits"><a href="/benefits/early-career-researchers/">Early career researcher benefits</a></li><li id="about-senior-researcher-benefits"><a href="/benefits/senior-researchers/">Senior researcher benefits</a></li><li id="about-open-review"><a href="/benefits/review-history-and-peer-review/">Open review (optional)</a></li><li id="about-rebuttal"><a href="/benefits/academic-rebuttal-letters/">Rebuttal letters</a></li></ul></li><!-- more --><li class="dropdown"><a href="#" class="dropdown-toggle"
+ data-toggle="dropdown">More <b class="caret"></b></a><ul class="dropdown-menu" role="menu" aria-labelledby="dLabel"><li><a href="/expertrxiv/"><img src="/assets/images/icons/expertrxiv.png" style="width: 80px"/></a></li><li><a href="/subjects/">Subjects</a></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">Search articles</a><ul class="dropdown-menu"><li role="presentation" class="dropdown-header">Peer-reviewed Journals</li><li><a tabindex="-1" href="/articles/?journal=peerj">PeerJ (Life, Biological, Environmental and Health Sciences)</a></li><li><a tabindex="-1" href="/articles/?journal=cs">PeerJ Computer Science</a></li><li><a tabindex="-1" href="/articles/?journal=pchem">PeerJ Physical Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=ochem">PeerJ Organic Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=ichem">PeerJ Inorganic Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=achem">PeerJ Analytical Chemistry</a></li><li><a tabindex="-1" href="/articles/?journal=matsci">PeerJ Materials Science</a></li><li role="presentation" class="dropdown-header">Preprints</li><li><a tabindex="-1" href="/preprints/">PeerJ Preprints</a></li></ul></li><li class="dropdown-submenu hidden-phone"><a tabindex="-1" href="#">Table of contents</a><ul class="dropdown-menu"><li role="presentation" class="dropdown-header">Table of Contents - current and archives</li><li><a tabindex="-1" href="/medicine/">PeerJ - Medicine articles</a></li><li><a tabindex="-1" href="/biology/">PeerJ - Biology & Life science articles</a></li><li><a tabindex="-1" href="/environment/">PeerJ - Environmental Science articles</a></li><li><a tabindex="-1" href="/general/">PeerJ - General bio (stats, legal, policy, edu)</a></li><li class="divider"></li><li><a tabindex="-1" href="/cs/">PeerJ Computer Science</a></li><li class="divider"></li><li><a tabindex="-1" href="/preprints-toc/">PeerJ Preprints</a></li></ul></li><li><a href="/academic-boards/advisors/">Academic advisors</a></li><li><a href="/reviewer-match/">Volunteer to review</a></li><li><a href="/collections/">Collections</a></li><li><a href="/questions/">Discussions</a></li><li><a href="https://peerj.com/blog/">Blog</a></li><li><a href="/prepaid-publishing/">Prepaid Publishing</a></li><li><a href="/about/reviews/">Reviews and awards</a></li><li><a href="/spread-the-word/">Spread the word</a></li><li><a href="/about/">Who are we?</a></li><li><a href="/about/contact/">Contact</a></li></ul></li></ul><!-- search --><div class="nav nav-collapse collapse pull-right nav-search"><form class="navbar-search" action="/search/"><input name="q" type="search"
+ data-autocomplete-url="/search/"
+ class="search-query" placeholder="Search"><!--<i class="icon-search"></i>--></form></div><ul class="nav pull-right nav-collapse collapse search-hide nav-utilities"><!-- login desktop --><li><a id="front-page-login" href="/login">Login</a></li></ul><ul class="nav pull-right search-hide nav-shifter"></ul><!-- for authors, my manuscripts --><ul class="nav nav-center nav-collapse collapse search-hide pull-right"><!-- for authors --><li class="dropdown nav-authors"><a href="#" class="dropdown-toggle" data-toggle="dropdown"><i
+ class="icon-info4 icon-large nav-icon icomoon"></i><span class="visible-wide">AUTHORS</span><b class="caret"></b></a><ul class="dropdown-menu"><li><a href="/benefits/">Peer Journals Overview</a></li><li><a href="/about/author-instructions/">Submission Guidelines</a></li><li><a href="/subjects/">Subject Areas</a></li><li><a href="/academic-boards/">Editorial Board</a></li><li><a href="/about/editorial-criteria/">Editorial Criteria</a></li><li><a href="/pricing/">Pricing</a></li><li><a href="/about/FAQ/">General FAQ</a></li><li><a href="/computer-science/faq-cs/">Computer Science FAQ</a></li><li><a href="/about/aims-and-scope/">Aims and Scope</a></li><li><a href="/about/author-interviews/">Author Interviews</a></li><li><a href="/about/policies-and-procedures/">Policies and Procedures</a></li><!--<li><a href="#">Why PeerJ?</a></li>--></ul></li><!-- my manuscripts --><!-- note: dropdown classes used just to maintain display --><li class="nav-manuscripts dropdown"><a href="/new/" class="dropdown-toggle"><span>SUBMIT ARTICLE</span></a></li></ul></div></nav>
+
+ <div class="item-top-navbar">
+ <div class="item-top-navbar-inner">
+ <div class="container-fluid">
+ <div class="row-fluid">
+ <div class="span12">
+ <div class="item-metrics-counts-top-nav article-item-metrics-counts">
+ <span class="article-item-metrics-count visible-all">
+ <span data-count="citations">203</span>
+ <span class="article-item-metrics-label">Citations</span>
+ </span>
+
+ <span class="article-item-metrics-count">
+ <span data-count="views-html">&nbsp;</span>
+ <span class="article-item-metrics-label">Views</span>
+ </span>
+
+ <span class="article-item-metrics-count">
+ <span data-count="views-pdf">&nbsp;</span>
+ <span class="article-item-metrics-label">Downloads</span>
+ </span>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+</div>
+
+ <div id="wrap">
+
+
+
+ <div id="nav-pad"></div>
+
+
+ <div class="container">
+
+ <noscript class="js-disabled-warning">
+ <div class="alert alert-danger">
+ <i class="icon icon-warning-sign"></i> Javascript is disabled in your browser. Please <a href="https://www.enable-javascript.com" target="_blank">enable Javascript</a> to view PeerJ.
+ </div>
+ </noscript>
+
+
+ <div class="row publication-jsondata" data-publication-meta="{&quot;publicationId&quot;:&quot;4375&quot;,&quot;Article-section&quot;:&quot;NA&quot;,&quot;journal&quot;:&quot;PeerJ&quot;,&quot;published&quot;:&quot;2018-02-13 08:54:18&quot;,&quot;preprint&quot;:false,&quot;publicationSubjects&quot;:[&quot;Legal Issues&quot;,&quot;Science Policy&quot;,&quot;Data Science&quot;],&quot;publicationInstitutions&quot;:[&quot;Simon Fraser University&quot;,&quot;University of Washington&quot;,&quot;University of Ottawa&quot;],&quot;publicationTop20Institution&quot;:true,&quot;publicationInstitutionPlan&quot;:true}">
+ <!-- Left sidebar -->
+ <div class="span1 article-sidebar">
+ <div class="article-sidebar-left">
+ <div class="sidebar-box sidebar-box--journal">
+ <a href="/" class="sidebar-box--journal-mask"></a>
+ <img src="https://d2pdyyx74uypu5.cloudfront.net/images/article/logos/article-logo-peerj.png">
+ </div>
+
+ <div id="btn-view-tweets" class="sidebar-box sidebar-box--tweet">
+ <div class="text-center">View 618 tweets <i class="icon-twitter"></i></div>
+ </div>
+
+ <a href="#related-research" class="sidebar-box sidebar-box--related text-center">
+ Related research
+ <i class="icon-angle-down"></i>
+ </a>
+
+ <!-- mobile only -->
+ <div class="item-leftside-actions">
+ <div class="sidebar-box sidebar-box--action js-download-modal-trigger">Download</div>
+
+ <div id="notification-actions-mobile" class="sidebar-box sidebar-box--action" data-href="/following/publication/4522/">
+ <span class="follow-btn " id="item-left-follow-btn"
+ title="Receive article updates" data-toggle="tooltip" data-success-modal="#followModal"
+ data-href="/follow/publication/4522/0/">
+ <span class="button_text_follow">Follow</span class="follow-btn publication-label publication-label-general publication-label-middle" id="item-left-follow-btn"
+ ></span>
+</div>
+
+
+
+ <div class="sidebar-box sidebar-box--social visible-desktop">
+ <div class="sidebar-box--social-title">Share</div>
+ <div class="d-flex">
+ <a class="pj-socialism tw-soc" href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ <a class="pj-socialism fb-soc" href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ <a class="pj-socialism em-soc" href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </div>
+</div>
+
+<div class="btn-group sidebar-box sidebar-box--action">
+ <a href="#" class="btn-share dropdown-toggle" data-toggle="dropdown">Share</a>
+
+ <ul class="dropdown-menu">
+ <li>
+ <a href="http://twitter.com/share?url&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F&amp;via&#x3D;thePeerJ&amp;text&#x3D;The&#x25;20State&#x25;20of&#x25;20OA&amp;related&#x3D;l_matthia&#x25;2Cbree_uw&#x25;2Cashleydfarley" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Twitter</a>
+ </li>
+ <li>
+ <a href="http://www.facebook.com/sharer.php?u&#x3D;https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Facebook</a>
+ </li>
+ <li>
+ <a href="mailto:?Subject&#x3D;Relevant&#x25;20research&#x25;20paper&#x25;20in&#x25;20PeerJ&amp;Body&#x3D;The&#x25;20state&#x25;20of&#x25;20OA&#x25;3A&#x25;20a&#x25;20large-scale&#x25;20analysis&#x25;20of&#x25;20the&#x25;20prevalence&#x25;20and&#x25;20impact&#x25;20of&#x25;20Open&#x25;20Access&#x25;20articles&#x25;20https&#x25;3A&#x25;2F&#x25;2Fpeerj.com&#x25;2Farticles&#x25;2F4375&#x25;2F" target="_blank" onclick="window.open(this.href, 'popupwindow', 'width=500,height=500,scrollbars,resizable'); return false;">Email</a>
+ </li>
+ </ul>
+</div>
+
+ </div>
+
+ </div>
+
+ <div class="peer-reviewed visible-phone">
+ <i class="icon-ok"></i> PEER-REVIEWED
+ </div>
+
+ </div>
+
+ <div id="annotations-sidebar" class="span5"></div>
+
+ <!-- Middle col -->
+ <div id="article-item-middle" class="span7"
+ data-ms-type-entity="articles" data-ms-type-id="research-article" data-ms-type-text="Research-article">
+
+ <div id="article-tweets-container">
+ <div class="row-fluid article-tweets-header">
+ <div class="span9">
+ <h2><em>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</em></h2>
+ </div>
+ <div class="span3">
+ <div class="btn btn-inverse pull-right" id="btn-view-article"><span class="icon-file"></span> View article</div>
+ </div>
+ </div>
+ <div class="tweet-items"> <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1297703289707016194/-sYklkZs_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=164969574" target="_blank"><strong></strong> <span class="twitter-handle">@LorenAndreaEP</span></a>
+ <span class="item-tweet-date">11 days ago</span>
+ </div>
+ <div>RT @AMAldanaS: También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradore…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/LorenAndreaEP/status/1317614486359072769" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1293635358064807937/YCE7J6e-_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15271321" target="_blank"><strong>Rachel Borchardt</strong> <span class="twitter-handle">@ButternutSquash</span></a>
+ <span class="item-tweet-date">12 days ago</span>
+ </div>
+ <div>@ces43 May I recommend Piwowar and Priem et al&#039;s article for that topic? https://t.co/Fnm0vtYtKS</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/ButternutSquash/status/1317104229358645248" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1210228942415814656/L6yRkSyu_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1117109826" target="_blank"><strong>Ana M. Aldana</strong> <span class="twitter-handle">@AMAldanaS</span></a>
+ <span class="item-tweet-date">40 days ago</span>
+ </div>
+ <div>También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradores de 2018 en donde se evidencia la ventaja de publicar en green open access: . https://t.co/1HAmYlfoBP</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/AMAldanaS/status/1306761873900044290" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/982225468286840837/BM5R0jJh_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=982223918223130624" target="_blank"><strong>Scicomm</strong> <span class="twitter-handle">@ScicommBot</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/ScicommBot/status/1298798812220346368" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1298797962437357568" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/879796293132050432/ywML6RLZ_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=879783542498217984" target="_blank"><strong>Open Science</strong> <span class="twitter-handle">@_open_science_</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_open_science_/status/1298795865247801345" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">62 days ago</span>
+ </div>
+ <div>How many articles are published in Open Access every year?
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1298795617167147009" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/856499301358477312/GLL-DiUg_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=850296415708471297" target="_blank"><strong>Open Pharma</strong> <span class="twitter-handle">@_OpenPharma</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_OpenPharma/status/1288751662912462848" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/879796293132050432/ywML6RLZ_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=879783542498217984" target="_blank"><strong>Open Science</strong> <span class="twitter-handle">@_open_science_</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/_open_science_/status/1288734888577961984" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1288734146982850560" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">90 days ago</span>
+ </div>
+ <div>How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1288733817323376640" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">102 days ago</span>
+ </div>
+ <div>@Mietmensch @unpaywall Gotcha. It&#039;s tough to generalize the answer to that, as it depends a lot on the specific journal and field. We dove into the details more in this paper, though: https://t.co/HRus7k3P0B</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1284579350273077248" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">103 days ago</span>
+ </div>
+ <div>@dwhly @unpaywall @hpiwowar historical stats are in here: https://t.co/HRus7k3P0B
+
+prediction for future is here: https://t.co/ex0vvThc9G</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1283946401492119552" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/456347532637896704/We-tZ-rF_normal.jpeg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=13616592" target="_blank"><strong>Eric Sieverts</strong> <span class="twitter-handle">@sieverts</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/sieverts/status/1283676444158308352" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/633201529575632897/5rB4RNtd_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=163244377" target="_blank"><strong>Hector Keun</strong> <span class="twitter-handle">@hectorkeun</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @OxonAndrew: A look ‘under the hood’ of open access publishing:
+
+“The state of OA: a large-scale analysis of the prevalence and impact o…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/hectorkeun/status/1283670319841116162" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1233869298344611840/suKOWJtS_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1024381399447613443" target="_blank"><strong>Asynchrony</strong> <span class="twitter-handle">@temporalization</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>RT @egonwillighagen: the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJ…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/temporalization/status/1283659204922875904" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/447652981291614208/RtR2dZtC_normal.jpeg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=536409536" target="_blank"><strong>Andrew Singer</strong> <span class="twitter-handle">@OxonAndrew</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>A look ‘under the hood’ of open access publishing:
+
+“The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles” @thePeerJ https://t.co/yCu96hCzMK</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OxonAndrew/status/1283655402773786625" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/668462090655371264/SBzaDNdf_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=22911650" target="_blank"><strong>Egon Willighâ“gen</strong> <span class="twitter-handle">@egonwillighagen</span></a>
+ <span class="item-tweet-date">104 days ago</span>
+ </div>
+ <div>the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJV72Uf https://t.co/DE9MPIKTdZ</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/egonwillighagen/status/1283654069815586817" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/668462090655371264/SBzaDNdf_normal.png"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=22911650" target="_blank"><strong>Egon Willighâ“gen</strong> <span class="twitter-handle">@egonwillighagen</span></a>
+ <span class="item-tweet-date">105 days ago</span>
+ </div>
+ <div>RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/egonwillighagen/status/1283497221950976006" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/820790537456226304/Tis8dyhv_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=15137538" target="_blank"><strong>Jason Priem</strong> <span class="twitter-handle">@jasonpriem</span></a>
+ <span class="item-tweet-date">105 days ago</span>
+ </div>
+ <div>@egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for values.</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/jasonpriem/status/1283494738251800576" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1220321309411942408/nhm-dSur_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1215236299344502791" target="_blank"><strong>Open Science Community Maastricht</strong> <span class="twitter-handle">@OSCMaastricht</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OSCMaastricht/status/1279836423529680897" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA…</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1279749950268563460" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1041368086765559808/9wrfnnLk_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=747439693801848832" target="_blank"><strong>In&amp;Vertebrates</strong> <span class="twitter-handle">@InandVertebrate</span></a>
+ <span class="item-tweet-date">115 days ago</span>
+ </div>
+ <div>The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018
+https://t.co/xkUMWA5jbJ
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/InandVertebrate/status/1279746851051200513" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1263564961068077059/CKFX9dV2_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=371391064" target="_blank"><strong>Marie E McVeigh</strong> <span class="twitter-handle">@JopieNet</span></a>
+ <span class="item-tweet-date">121 days ago</span>
+ </div>
+ <div>@lisalibrarian @ashleydfarley @andy_nobes Usual def of &quot;bronze&quot; in @our_research is free to read, but does not have CC license.
+https://t.co/T34fQja0nN</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/JopieNet/status/1277662956373921792" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+ <div class="row-fluid tweet-item">
+ <div class="span1 offset1"><img src="https://pbs.twimg.com/profile_images/1264543181099528193/4WTe1NqL_normal.jpg"></div>
+ <div class="span8">
+ <div>
+ <a class="twitter-profile-url" href="https://twitter.com/intent/user/?user_id=1252313225011449856" target="_blank"><strong>OpenSci Talk</strong> <span class="twitter-handle">@OpenSciTalk</span></a>
+ <span class="item-tweet-date">146 days ago</span>
+ </div>
+ <div>RT @InandVertebrate: How many articles are published in Open Access every year?
+https://t.co/xkUMWzNIkb
+#openaccess #openscience #scicomm</div>
+ <div class="item-tweet-cta">
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-comment-alt"></i> reply</a></span>
+ <span class="item-tweet-cta-action"><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-retweet"></i> retweet</a></span>
+ <span><a href="https://twitter.com/OpenSciTalk/status/1268621662469017601" target="_blank"><i class="icon-heart-empty"></i> like</a></span>
+ </div>
+ </div>
+ </div>
+
+<div class="tweet-pagination pagination">
+
+ <ul>
+
+ <li class="active"><a href="#">1</a></li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=2" class="page">2</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=3" class="page">3</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=4" class="page">4</a>
+ </li>
+
+ <li>
+ <a href="/articles/4375/tweets/?page=5" class="page">5</a>
+ </li>
+
+
+ <li>
+ <a href="/articles/4375/tweets/?page=2">Next</a>
+ </li>
+ </ul>
+
+ <hr>
+</div></div>
+</div>
+ <div id="article-main-container">
+ <div class="article-section-breadcrumb">
+ <span class="icon-angle-left"></span>
+ <span><a href="/"><em>PeerJ</em></a></span>
+ </div>
+
+
+ <div class="hidden-print">
+
+ <div id="article-preexisting" class="well peerj-paper-well" >
+ <i class="icon-pushpin icon-large"></i> Note that a <a href="/preprints/3119/">Preprint of this article</a> also exists, first published August 2, 2017.
+ </div>
+ </div>
+
+ <!-- Main article -->
+ <article itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle"><header class="article-meta front"><h1 class="article-title" itemprop="name headline">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</h1>
+<div class="article-authors">
+<span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-1" data-jats-contrib-type="author" data-jats-corresp="yes" data-jats-equal-contrib="yes" itemprop="author"><a href="author-1" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Heather</span> <span class="surname" itemprop="familyName">Piwowar</span></span></a><a class="corresp" href="mailto:heather@impactstory.org" target="_blank" title="email the corresponding author" data-toggle="tooltip" itemprop="email"><i class="icon-envelope">​</i></a><span class="equal-contribution" title="These authors contributed equally to this work." data-toggle="tooltip"><i class="icon-asterisk">​</i></span><sup class="contrib-xref-group"><a class="aff xref" href="#aff-1" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-1">1</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-2" data-jats-contrib-type="author" data-jats-corresp="yes" data-jats-equal-contrib="yes" itemprop="author"><a href="author-2" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Jason</span> <span class="surname" itemprop="familyName">Priem</span></span></a><a class="corresp" href="mailto:jason@impactstory.org" target="_blank" title="email the corresponding author" data-toggle="tooltip" itemprop="email"><i class="icon-envelope">​</i></a><span class="equal-contribution" title="These authors contributed equally to this work." data-toggle="tooltip"><i class="icon-asterisk">​</i></span><sup class="contrib-xref-group"><a class="aff xref" href="#aff-1" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-1">1</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-3" data-jats-contrib-type="author" itemprop="author"><a href="author-3" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Vincent</span> <span class="surname" itemprop="familyName">Larivière</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-2" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-2">2</a>,<a class="aff xref" href="#aff-3" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-3">3</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-4" data-jats-contrib-type="author" itemprop="author"><a href="author-4" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Juan Pablo</span> <span class="surname" itemprop="familyName">Alperin</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-4" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-4">4</a>,<a class="aff xref" href="#aff-5" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-5">5</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-5" data-jats-contrib-type="author" itemprop="author"><a href="author-5" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Lisa</span> <span class="surname" 
itemprop="familyName">Matthias</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-6" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-6">6</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-6" data-jats-contrib-type="author" itemprop="author"><a href="author-6" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Bree</span> <span class="surname" itemprop="familyName">Norlander</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a>,<a class="aff xref" href="#aff-8" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-8">8</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-7" data-jats-contrib-type="author" itemprop="author"><a href="author-7" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Ashley</span> <span class="surname" itemprop="familyName">Farley</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a>,<a class="aff xref" href="#aff-8" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-8">8</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-8" data-jats-contrib-type="author" itemprop="author"><a href="author-8" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Jevin</span> <span class="surname" itemprop="familyName">West</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-7" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-7">7</a></sup></span>, <span class="contrib" itemscope="itemscope" itemtype="http://schema.org/Person" id="author-9" data-jats-contrib-type="author" itemprop="author"><a href="author-9" rel="author" itemprop="url"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Stefanie</span> <span class="surname" itemprop="familyName">Haustein</span></span></a><sup class="contrib-xref-group"><a class="aff xref" href="#aff-3" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-3">3</a>,<a class="aff xref" href="#aff-9" itemprop="affiliation" itemscope="itemscope" itemtype="http://schema.org/Organization" itemref="aff-9">9</a></sup></span>
+</div>
+<div id="article-information">
+<div class="article-notes">
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-1">
+<span class="article-label-container"><a class="article-label">1</a></span><span itemprop="address"><span class="institution">Impactstory</span>, <span class="city">Sanford</span>, <span class="state">NC</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-2">
+<span class="article-label-container"><a class="article-label">2</a></span><span itemprop="address"><span class="institution">École de bibliothéconomie et des sciences de l’information, Université de Montréal</span>, <span class="city">Montréal</span>, <span class="state">QC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-3">
+<span class="article-label-container"><a class="article-label">3</a></span><span itemprop="address"><span class="institution">Observatoire des Sciences et des Technologies (OST), Centre Interuniversitaire de Recherche sur la Science et la Technologie (CIRST), Université du Québec à Montréal</span>, <span class="city">Montréal</span>, <span class="state">QC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-4">
+<span class="article-label-container"><a class="article-label">4</a></span><span itemprop="address"><span class="institution">Canadian Institute for Studies in Publishing, Simon Fraser University</span>, <span class="city">Vancouver</span>, <span class="state">BC</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-5">
+<span class="article-label-container"><a class="article-label">5</a></span><span itemprop="address"><span class="institution">Public Knowledge Project</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-6">
+<span class="article-label-container"><a class="article-label">6</a></span><span itemprop="address"><span class="institution">Scholarly Communications Lab, Simon Fraser University</span>, <span class="city">Vancouver</span>, <span class="country">Canada</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-7">
+<span class="article-label-container"><a class="article-label">7</a></span><span itemprop="address"><span class="institution">Information School, University of Washington</span>, <span class="city">Seattle</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-8">
+<span class="article-label-container"><a class="article-label">8</a></span><span itemprop="address"><span class="institution">FlourishOA</span>, <span class="country">USA</span></span>
+</div>
+<div itemscope="itemscope" itemtype="http://schema.org/Organization" id="aff-9">
+<span class="article-label-container"><a class="article-label">9</a></span><span itemprop="address"><span class="institution">School of Information Studies, University of Ottawa</span>, <span class="city">Ottawa</span>, <span class="state">ON</span>, <span class="country">Canada</span></span>
+</div>
+</div>
+<dl class="article-identifiers">
+<dt> DOI</dt>
+<dd>
+<a href="https://doi.org/10.7717/peerj.4375" itemprop="sameAs">10.7717/peerj.4375</a><meta itemprop="sameAs" content="info:doi/10.7717/peerj.4375">
+</dd>
+</dl>
+<dl class="article-dates">
+<dt>Published</dt>
+<dd><time itemprop="datePublished">2018-02-13</time></dd>
+<dt>Accepted</dt>
+<dd><time data-itemprop="dateAccepted">2018-01-25</time></dd>
+<dt>Received</dt>
+<dd><time itemprop="dateCreated">2017-08-09</time></dd>
+</dl>
+<dl class="article-editors">
+<dt>Academic Editor</dt>
+<dd itemprop="editor" itemscope="itemscope" itemtype="http://schema.org/Person"><a itemprop="url" href="editor-1" class="contrib" data-jats-contrib-type="editor"><span class="name" itemprop="name"><span class="given-names" itemprop="givenName">Robert</span> <span class="surname" itemprop="familyName">McDonald</span></span></a></dd>
+</dl>
+<dl class="article-subjects">
+<dt>Subject Areas</dt>
+<dd>
+<a class="subject" itemprop="about" href="/subjects/?filter=Legal%20Issues">Legal Issues</a>, <a class="subject" itemprop="about" href="/subjects/?filter=Science%20Policy">Science Policy</a>, <a class="subject" itemprop="about" href="/subjects/?filter=Data%20Science">Data Science</a>
+</dd>
+<dt>Keywords</dt>
+<dd>
+<span class="kwd" itemprop="keywords">Open access</span>, <span class="kwd" itemprop="keywords">Open science</span>, <span class="kwd" itemprop="keywords">Scientometrics</span>, <span class="kwd" itemprop="keywords">Publishing</span>, <span class="kwd" itemprop="keywords">Libraries</span>, <span class="kwd" itemprop="keywords">Scholarly communication</span>, <span class="kwd" itemprop="keywords">Bibliometrics</span>, <span class="kwd" itemprop="keywords">Science policy</span>
+</dd>
+</dl>
+<dl class="article-license">
+<dt>Copyright</dt>
+<dd>© <span itemprop="copyrightYear">2018</span> <span itemprop="copyrightHolder">Piwowar et al.</span>
+</dd>
+<dt>Licence</dt>
+<dd>
+ <span class="license-p">This is an open access article distributed under the terms of the <a class="ext-link" href="http://creativecommons.org/licenses/by/4.0/" rel="license" data-jats-ext-link-type="uri">Creative Commons Attribution License</a>, which permits unrestricted use, distribution, reproduction and adaptation in any medium and for any purpose provided that it is properly attributed. For attribution, the original author(s), title, publication source (PeerJ) and either DOI or URL of the article must be cited.</span>
+ </dd>
+</dl>
+<dl class="self-citation">
+<dt>Cite this article</dt>
+<dd>
+<span class="self-citation-authors">Piwowar H, Priem J, Larivière V, Alperin JP, Matthias L, Norlander B, Farley A, West J, Haustein S.</span> <span class="self-citation-year">2018</span>. <span class="self-citation-title">The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles</span>. <span itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="self-citation-journal" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PeerJ</span></span> <span class="self-citation-volume" itemprop="volumeNumber">6</span></span>:<span class="self-citation-elocation" itemprop="pageStart">e4375</span> <a href="https://doi.org/10.7717/peerj.4375" itemprop="url">https://doi.org/10.7717/peerj.4375</a>
+</dd>
+</dl>
+<div class="alert alert-success view-public-reviews">The authors have chosen to make <a href="/articles/4375/reviews/">the review history of this article</a> public.</div>
+</div>
+<div>
+<h2>Abstract</h2>
+<div class="abstract" itemprop="description">
+ <p>Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.</p>
+ </div>
+</div></header><main><div class="body" lang="en">
+ <section class="sec" id="intro">
+ <h2 class="heading">Introduction</h2>
+ <p id="p-1">The movement to provide open access (OA) to all research literature is now over fifteen years old. In the last few years, several developments suggest that after years of work, a sea change is imminent in OA. First, funding institutions are increasingly mandating OA publishing for grantees. In addition to the US National Institutes of Health, which mandated OA in 2008 (<a class="ext-link" href="https://publicaccess.nih.gov/index.htm" data-jats-ext-link-type="uri">https://publicaccess.nih.gov/index.htm</a>), the Bill and Melinda Gates Foundation (<a class="ext-link" href="http://www.gatesfoundation.org/How-We-Work/General-Information/Open-Access-Policy" data-jats-ext-link-type="uri">http://www.gatesfoundation.org/How-We-Work/General-Information/Open-Access-Policy</a>), the European Commission (<a class="ext-link" href="http://ec.europa.eu/research/participants/data/ref/h2020/grants_manual/hi/oa_pilot/h2020-hi-oa-pilot-guide_en.pdf" data-jats-ext-link-type="uri">http://ec.europa.eu/research/participants/data/ref/h2020/grants_manual/hi/oa_pilot/h2020-hi-oa-pilot-guide_en.pdf</a>), the US National Science Foundation (<a class="ext-link" href="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf" data-jats-ext-link-type="uri">https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf</a>), and the Wellcome Trust (<a class="ext-link" href="https://wellcome.ac.uk/press-release/wellcome-trust-strengthens-its-open-access-policy" data-jats-ext-link-type="uri">https://wellcome.ac.uk/press-release/wellcome-trust-strengthens-its-open-access-policy</a>), among others, have made OA diffusion mandatory for grantees. Second, several tools have sprung up to build value atop the growing OA corpus. These include discovery platforms like ScienceOpen and 1Science, and browser-based extensions like the Open Access Button, Canary Haz, and Unpaywall. Third, Sci-Hub (a website offering pirate access to full text articles) has built an enormous user base, provoking newly intense conversation around the ethics and efficiency of paywall publishing (<a class="xref xref-bibr" href="https://doi.org/10.1126%2Fscience.352.6285.508" title="Who’s downloading pirated papers? Everyone" data-jats-ref-type="bibr" data-jats-rid="ref-13">Bohannon, 2016</a>; <a class="xref xref-bibr" href="https://doi.org/10.12688%2Ff1000research.11366.1" title="Looking into Pandora’s Box: the content of Sci-Hub and its usage [version 1; referees: 2 approved, 2 approved with reservations]" data-jats-ref-type="bibr" data-jats-rid="ref-26">Greshake, 2017</a>). Academic social networks like ResearchGate and Academia.edu now offer authors an increasingly popular but controversial solution to author self-archiving (<a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2016.08.002" title="Hybrid open access—a longitudinal study" data-jats-ref-type="bibr" data-jats-rid="ref-8">Björk, 2016a</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1021" title="The open access movement at a crossroad: are the big publishers and academic social media taking over?" data-jats-ref-type="bibr" data-jats-rid="ref-9">Björk, 2016b</a>). 
Finally, the increasing growth in the cost of toll-access subscriptions, particularly via so-called “Big Deals” from publishers, has begun to force libraries and other institutions to initiate large-scale subscription cancellations; recent examples include Caltech, the University of Maryland, University of Konstanz, Université de Montréal, and the national system of Peru (<a class="xref xref-bibr" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm" title="UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group" data-jats-ref-type="bibr" data-jats-rid="ref-48">Université de Montréal, 2017</a>; <a class="xref xref-bibr" href="https://doi.org/10.1038%2Fnature.2016.21223" title="Scientists in Germany, Peru and Taiwan to lose access to Elsevier journals" data-jats-ref-type="bibr" data-jats-rid="ref-41">Schiermeier &amp; Mega, 2017</a>; <a class="xref xref-bibr" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/" title="When the wolf finally arrives: big deal cancelations in North American Libraries" data-jats-ref-type="bibr" data-jats-rid="ref-1">Anderson, 2017a</a>; <a class="xref xref-bibr" href="https://www.uni-konstanz.de/universitaet/aktuelles-und-medien/aktuelle-meldungen/aktuelles/aktuelles/teurer-als-die-wissenschaft-erlaubt/" title="Teurer als die Wissenschaft erlaubt" data-jats-ref-type="bibr" data-jats-rid="ref-47">Université Konstanz, 2014</a>). As the toll-access status quo becomes increasingly unaffordable, institutions are looking to OA as part of their “Plan B” to maintain access to essential literature (<a class="xref xref-bibr" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf" title="Leveraging the growth of open access in library collection decision making" data-jats-ref-type="bibr" data-jats-rid="ref-3">Antelman, 2017</a>).</p>
+ <p id="p-2">Open access is thus provoking a new surge of investment, controversy, and relevance across a wide group of stakeholders. We may be approaching a moment of great importance in the development of OA, and indeed of the scholarly communication system. However, despite the recent flurry of development and conversation around OA, there is a need for large-scale, high-quality data on the growth and composition of the OA literature itself. In particular, there is a need for a data-driven “state of OA†overview that is (a) large-scale, (b) up-to-date, and (c) reproducible. This paper attempts to provide such an overview, using a new open web service called oaDOI that finds links to legally-available OA scholarly articles.<a class="xref xref-fn" href="#fn-1" data-jats-ref-type="fn" data-jats-rid="fn-1"><sup>1</sup></a> Building on data provided by the oaDOI service, we answer the following questions:</p>
+ <ol class="list" id="list-1" data-jats-list-type="order">
+ <li class="list-item">
+<p id="p-4">What percentage of the scholarly literature is OA, and how does this percentage vary according to publisher, discipline, and publication year?</p>
+ </li>
+ <li class="list-item">
+<p id="p-5">Are OA papers more highly-cited than their toll-access counterparts?</p>
+ </li>
+ </ol>
+ <p id="p-6">The next section provides a brief review of the background literature for this paper, followed by a description of the datasets and methods used, as well as details on the definition and accuracy of the oaDOI categorization. Results are then presented, in turn, for each research question, and are followed by a general discussion and conclusions.</p>
+ </section>
+ <section class="sec">
+ <h2 class="heading">Literature Review</h2>
+ <p id="p-7">Fifteen years of OA research have produced a significant body of literature, a complete review of which falls outside the scope of this paper (for recent, in-depth reviews, see <a class="xref xref-bibr" href="https://doi.org/10.12688%2Ff1000research.8460.3" title="The academic, economic and societal impacts of Open Access: an evidence-based review (version 3; referees: 3 approved, 2 approved with reservations)" data-jats-ref-type="bibr" data-jats-rid="ref-46">Tennant et al. (2016)</a> and <a class="xref xref-bibr" href="https://doi.org/10.7554%2FeLife.16800" title="How open science helps researchers succeed" data-jats-ref-type="bibr" data-jats-rid="ref-36">McKiernan et al. (2016)</a>. Here we instead briefly review three major topics from the OA literature: defining OA and its subtypes, assessing the prevalence of OA, and examining the relative citation impact of OA.</p>
+ <p id="p-8">Despite the large literature on OA, the term itself remains “somewhat fluid†(Antelman, 2004), making an authoritative definition challenging. The most influential definition of OA comes from the 2002 Budapest Open Access Initiative (BOAI), and defines OA as making content both <i>free to read</i> and <i>free to reuse</i>, requiring the opportunity of OA users to “crawl (articles) for indexing, pass them as data to software, or use them for any other lawful purpose.†In practice, the BOAI definition is roughly equivalent to the popular “CC-BY†Creative Commons license (<a class="xref xref-bibr" href="https://creativecommons.org/licenses/by/4.0/" title="Attribution 4.0 International (CC BY 4.0)" data-jats-ref-type="bibr" data-jats-rid="ref-19">Creative Commons, 2018</a>). However, a number of other sources prefer a less strict definition, requiring only that OA “makes the research literature free to read online†(<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20nine%20flavours%20of%20open%20access%20scholarly%20publishing&amp;author=Willinsky&amp;publication_year=2003" title="The nine flavours of open access scholarly publishing" data-jats-ref-type="bibr" data-jats-rid="ref-51">Willinsky, 2003</a>), or that it is “digital, online, [and] free of charge.†(<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Status%20of%20open%20access%20in%20the%20biomedical%20field%20in%202005&amp;author=Matsubayashi&amp;publication_year=2009" title="Status of open access in the biomedical field in 2005" data-jats-ref-type="bibr" data-jats-rid="ref-34">Matsubayashi et al., 2009</a>). Others have suggested it is more valuable to think of OA as a spectrum (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2016.1182672" title="Measuring the degrees of openness of scholarly journals with the open access spectrum (OAS) evaluation tool" data-jats-ref-type="bibr" data-jats-rid="ref-17">Chen &amp; Olijhoek, 2016</a>).</p>
+ <p id="p-9">Researchers have identified a number of subtypes of OA; some of these have near-universal support, while others remain quite controversial. We will not attempt a comprehensive list of these, but instead note several that have particular relevance for the current study.</p>
+ <ul class="list" id="list-2" data-jats-list-type="bullet">
+ <li class="list-item">
+<p id="p-10">Libre OA (<a class="xref xref-bibr" href="https://dash.harvard.edu/handle/1/4322580" title="Gratis and libre open access" data-jats-ref-type="bibr" data-jats-rid="ref-44">Suber, 2008</a>): extends user’s rights to read and also to reuse literature for purposes like automated crawling, archiving, or other purposes. The Libre OA definition is quite similar to the BOAI definition of OA.</p>
+ </li>
+ <li class="list-item">
+<p id="p-11">Gratis OA (<a class="xref xref-bibr" href="https://dash.harvard.edu/handle/1/4322580" title="Gratis and libre open access" data-jats-ref-type="bibr" data-jats-rid="ref-44">Suber, 2008</a>): in contrast to Libre, Gratis extends <i>only</i> rights to read articles.</p>
+ </li>
+ <li class="list-item">
+<p id="p-12">Gold OA: articles are published in an “OA journal,†a journal in which all articles are open directly on the journal website. In practice, OA journals are most often defined by their inclusion in the Directory of Open Access Journals (DOAJ) (<a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al., 2014</a>; <a class="xref xref-bibr" href="http://arxiv.org/abs/1206.3664" title="Green and gold open access percentages and growth, by discipline" data-jats-ref-type="bibr" data-jats-rid="ref-24">Gargouri et al., 2012</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-13">Green OA: Green articles are published in a toll-access journal, but self-archived in an OA archive. These “OA archives†are either disciplinary repositories like ArXiv, or “institutional repositories (IRs) operated by universities, and the archived articles may be either the published versions, or electronic preprints (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2008.10765150" title="The access/impact problem and the green and gold roads to open access: an update" data-jats-ref-type="bibr" data-jats-rid="ref-28">Harnad et al., 2008</a>). Most Green OA articles do not meet the BOAI definition of OA since they do not extend reuse rights (making them Gratis OA).</p>
+ </li>
+ <li class="list-item">
+<p id="p-14">Hybrid OA: articles are published in a subscription journal but are immediately free to read under an open license, in exchange for an an article processing charge (APC) paid by authors (<a class="xref xref-bibr" href="https://doi.org/10.1241%2Fjohokanri.41.678" title="Free internet access to traditional journals" data-jats-ref-type="bibr" data-jats-rid="ref-50">Walker &amp; Soichi, 1998</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fasi.22856" title="Delayed open access: an overlooked high-impact category of openly available scientific literature" data-jats-ref-type="bibr" data-jats-rid="ref-32">Laakso &amp; Björk, 2013</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-15">Delayed OA: articles are published in a subscription journal, but are made free to read after an embargo period (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20access%20principle:%20the%20case%20for%20open%20access%20to%20research%20and%20scholarship&amp;author=Willinsky&amp;publication_year=2009" title="The access principle: the case for open access to research and scholarship" data-jats-ref-type="bibr" data-jats-rid="ref-52">Willinsky, 2009</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fasi.22856" title="Delayed open access: an overlooked high-impact category of openly available scientific literature" data-jats-ref-type="bibr" data-jats-rid="ref-32">Laakso &amp; Björk, 2013</a>).</p>
+ </li>
+ <li class="list-item">
+<p id="p-16">Academic Social Networks (ASN): Articles are shared by authors using commercial online social networks like ResearchGate and Academia.edu. While some include these in definitions of OA (<a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al., 2013</a>; <a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1021" title="The open access movement at a crossroad: are the big publishers and academic social media taking over?" data-jats-ref-type="bibr" data-jats-rid="ref-9">Björk, 2016b</a>), others argue that content shared on ASNs is not OA at all. Unlike Green OA repositories, ASNs do not check for copyright compliance, and therefore as much as half their content is illegally posted and hosted (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-017-2291-4" title="Copyright compliance and infringement in ResearchGate full-text journal articles" data-jats-ref-type="bibr" data-jats-rid="ref-30">Jamali, 2017</a>). This raises concerns over the persistence of content, since, as was the case in October 2017, publishers can and do issue large-scale takedown notices to ASN ordering the removal of infringing content (<a class="xref xref-bibr" href="http://www.sciencemag.org/news/2017/10/publishers-take-researchgate-court-alleging-massive-copyright-infringement" title="Publishers take ResearchGate to court, alleging massive copyright infringement" data-jats-ref-type="bibr" data-jats-rid="ref-15">Chawla, 2017</a>). Others have raised questions about the sustainability and ethics of ASN services themselves (<a class="xref xref-bibr" href="http://osc.universityofcalifornia.edu/2015/12/a-social-networking-site-is-not-an-open-access-repository/index.html" title="A social networking site is not an open access repository" data-jats-ref-type="bibr" data-jats-rid="ref-22">Fortney &amp; Gonder, 2015</a>). Due to these concerns, and inconsistent support from the literature, we exclude ASN-hosted content from our definition of OA.<a class="xref xref-fn" href="#fn-2" data-jats-ref-type="fn" data-jats-rid="fn-2"><sup>2</sup></a> </p>
+ </li>
+ <li class="list-item">
+<p id="p-18">“Black OAâ€: Articles shared on illegal pirate sites, primarily Sci-Hub and LibGen. Although (<a class="xref xref-bibr" href="https://doi.org/10.1002%2Fleap.1096" title="Gold, green, and black open access" data-jats-ref-type="bibr" data-jats-rid="ref-10">Björk, 2017</a>) labels these articles as a subtype of OA, the literature has nearly no support for including Sci-Hub articles in definitions of OA. Given this, we exclude Sci-Hub and LibGen content from our definition of OA.</p>
+ </li>
+ </ul>
+ <p id="p-19">Based on the consensus (and in some cases, lack of consensus) around these definitions and subtypes, we will use the following definition of OA in the remainder of this paper: <b>OA articles are free to read online, either on the publisher website or in an OA repository.</b></p>
+ <section class="sec">
+ <h3 class="heading">Prevalence of OA</h3>
+ <p id="p-20">Many studies have estimated what proportion of the literature is available OA, including <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0011273" title="Open access to the scientific journal literature: situation 2009" data-jats-ref-type="bibr" data-jats-rid="ref-12">Björk et al. (2010)</a>, <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0020961" title="The development of open access journal publishing from 1993 to 2009" data-jats-ref-type="bibr" data-jats-rid="ref-33">Laakso et al. (2011)</a>, <a class="xref xref-bibr" href="https://doi.org/10.1186%2F1741-7015-10-124" title="Anatomy of open access publishing: a study of longitudinal development and internal structure" data-jats-ref-type="bibr" data-jats-rid="ref-31">Laakso &amp; Björk (2012)</a>, <a class="xref xref-bibr" href="http://arxiv.org/abs/1206.3664" title="Green and gold open access percentages and growth, by discipline" data-jats-ref-type="bibr" data-jats-rid="ref-24">Gargouri et al. (2012)</a>, <a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al. (2013)</a>, <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> and <a class="xref xref-bibr" href="https://doi.org/10.1080%2F19322909.2013.795426" title="Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles" data-jats-ref-type="bibr" data-jats-rid="ref-16">Chen (2013)</a>. We are not aware of any studies since 2014. The most recent two analyses estimate that more than 50% of papers are now freely available online, when one includes both OA and ASNs. <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a>, the most comprehensive study to date, estimates that of papers published between 2011 and 2013, 12% of articles could be retrieved from the journal website, 6% from repositories, and 31% by other mechanisms (including ASNs). <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> also found that the availability of papers published between 1996 and 2011 increased by 4% between April 2013 and April 2014, noting that “backfilling†is a significant contributor to green OA. Their discipline-level analysis confirmed the findings of other studies, that the proportion of OA is relatively high in biomedical research and math, while notably low in engineering, chemistry, and the humanities.</p>
+ <p id="p-21">This <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> study is of particular interest because it used automated web scraping to find and identify OA content; most earlier efforts have relied on laborious manual checking of the DOAJ, publisher webpages, Google, and/or Google Scholar (though see <a class="xref xref-bibr" href="http://arxiv.org/abs/cs/0606079" title="Ten-year cross-disciplinary comparison of the growth of open access and how it increases research citation impact" data-jats-ref-type="bibr" data-jats-rid="ref-27">Hajjem, Harnad &amp; Gingras (2006)</a> for a notable early exception). By using automated methods, Archambault et al. were able to sample hundreds of thousands of articles, greatly improving statistical power and supporting more nuanced inferences. Moreover, by creating a system that indexes OA content, they address a major concern in the world of OA research; as <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0020961" title="The development of open access journal publishing from 1993 to 2009" data-jats-ref-type="bibr" data-jats-rid="ref-33">Laakso et al. (2011)</a> observes: “A major challenge for research...has been the lack of comprehensive indexing for both OA journals and their articles.†The automated system of <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> is very accurate—it only misclassifies a paper as OA 1% of the time, and finds about 75% of all OA papers that exist online, as per <a class="xref xref-bibr" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom" title="Research impact of paywalled versus open access papers" data-jats-ref-type="bibr" data-jats-rid="ref-6">Archambault et al. (2016)</a>. However, the algorithm is not able to distinguish Gold from Hybrid OA. More problematically for researchers, the database used in the study is not open online for use in follow-up research. Instead, the data has since been used to build the commercial subscription-access database 1science (<a class="ext-link" href="http://www.1science.com/oanumbr.html" data-jats-ext-link-type="uri">http://www.1science.com/oanumbr.html</a>).</p>
+ </section>
+ <section class="sec">
+ <h3 class="heading">The open access citation advantage</h3>
+ <p id="p-22">Several dozen studies have compared the citation counts of OA articles and toll-access articles. Most of these have reported higher citation counts for OA, suggesting a so-called “open access citation advantage†(OACA); several annotated bibliographies have been created to track this literature (<a class="xref xref-bibr" href="http://sparceurope.org/what-we-do/open-access/sparc-europe-open-access-resources/open-access-citation-advantage-service-oaca/oaca-list/" title="The open access citation advantage: list of studies until 2015" data-jats-ref-type="bibr" data-jats-rid="ref-43">SPARC Europe, 2015</a>; <a class="xref xref-bibr" href="https://doi.org/10.5062%2FF4Q81B0W" title="Open access citation advantage: an annotated bibliography" data-jats-ref-type="bibr" data-jats-rid="ref-49">Wagner, 2010</a>; <a class="xref xref-bibr" href="https://www.scienceopen.com/search#%7B%22order%22%3A0%2C%22context%22%3A%7B%22collection%22%3A%7B%22id%22%3A%22996823e0-8104-4490-b26a-f2f733f810fb%22%2C%22kind%22%3A0%7D%2C%22kind%22%3A11%7D%2C%22kind%22%3A77%7D" title="The open access citation advantage" data-jats-ref-type="bibr" data-jats-rid="ref-45">Tennant, 2017</a>). The OACA is not universally supported. Many studies supporting the OACA have been criticised on methodological grounds (<a class="xref xref-bibr" href="https://doi.org/10.3163%2F1536-5050.99.3.008" title="The impact of free access to the scientific literature: a review of recent research" data-jats-ref-type="bibr" data-jats-rid="ref-21">Davis &amp; Walters, 2011</a>), and an investigation using the randomized-control trial method failed to find evidence of an OACA (<a class="xref xref-bibr" href="https://doi.org/10.1096%2Ffj.11-183988" title="Open access, readership, citations: a randomized controlled trial of scientific journal publishing" data-jats-ref-type="bibr" data-jats-rid="ref-20">Davis, 2011</a>). However, recent investigations using robust methods have continued to observe an OACA. For instance, <a class="xref xref-bibr" href="https://doi.org/10.1111%2Fecin.12064" title="Identifying the effect of open access on citations using a panel of science journals" data-jats-ref-type="bibr" data-jats-rid="ref-35">McCabe &amp; Snyder (2014)</a> used a complex statistical model to remove confounding effects of author selection (authors may selectively publish their higher-impact work as OA), reporting a small but meaningful 8% OACA. <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> describe a 40% OACA in a massive sample of over one million articles using field-normalized citation rates. <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0159614" title="The post-embargo open access citation advantage: it exists (probably), it’s modest (usually), and the rich get richer (of course)" data-jats-ref-type="bibr" data-jats-rid="ref-38">Ottaviani (2016)</a> used a natural experiment as articles (not selected by authors) emerged from embargoes to become OA, and reports a 19% OACA excluding the author self-selection bias for older articles outside their prime citation years.</p>
+ </section>
+ </section>
+ <section class="sec" id="methods">
+ <h2 class="heading">Methods</h2>
+ <section class="sec">
+ <h3 class="heading">OA determination</h3>
+ <section class="sec">
+ <h4 class="heading">Classifications</h4>
+ <p id="p-23">We classify publications into two categories, OA and Closed. As described above, we define OA as <i>free to read online, either on the publisher website or in an OA repository</i>; all articles not meeting this definition were defined as Closed. We further divide the OA literature into one of four exclusive subcategories, resulting in a five-category classification system for articles:</p>
+ <ul class="list" id="list-3" data-jats-list-type="bullet">
+ <li class="list-item">
+<p id="p-24"><b>Gold</b>: Published in an open-access journal that is indexed by the DOAJ.</p>
+ </li>
+ <li class="list-item">
+<p id="p-25"><b>Green</b>: Toll-access on the publisher page, but there is a free copy in an OA repository.</p>
+ </li>
+ <li class="list-item">
+<p id="p-26"><b>Hybrid</b>: Free under an open license in a toll-access journal.</p>
+ </li>
+ <li class="list-item">
+<p id="p-27"><b>Bronze</b>: Free to read on the publisher page, but without an clearly identifiable license.</p>
+ </li>
+ <li class="list-item">
+<p id="p-28"><b>Closed</b>: All other articles, including those shared only on an ASN or in Sci-Hub.</p>
+ </li>
+ </ul>
+ <p id="p-29">These categories are largely consistent with their use throughout the OA literature, although a few clarifications are useful. First, we (like many other OA studies) do not include ASN-hosted content as OA. Second, categories are exclusive, and publisher-hosted content takes precedence over self-archived content. This means that if an article is posted in both a Gold journal and an OA repository, we would classify it as Gold, not Green. Put another way, publisher-hosted content can “shadow†archived articles that would otherwise be Green. This definition of Green (“available in a repository but <i>not</i> available from the publisherâ€) is often used in the OA literature (including by Steven Harnad, the coiner of the Green and Gold terms <a class="xref xref-bibr" href="https://doi.org/10.1080%2F00987913.2008.10765150" title="The access/impact problem and the green and gold roads to open access: an update" data-jats-ref-type="bibr" data-jats-rid="ref-28">Harnad et al., 2008</a>), but this usage is not unanimous. Some studies allow a given article to be <i>both</i> Gold and Green; compared to these, our classification system does undercount Green. Hybrid articles share properties with Gold articles (both are free to read and are licensed for re-use), but differ in the venue of publication (i.e., Hybrid articles are published in journals not considered open access by the DOAJ) and in that Hybrid articles are not necessarily immediately available (i.e., they may only be freely available after an embargo). We also add a novel subcategory, Bronze. Bronze shares attributes of Gold and Hybrid; like both, Bronze OA articles are publisher-hosted. Unlike Gold OA, Bronze articles are not published in journals considered open access in the DOAJ. Unlike Hybrid, Bronze articles carry no license information. Although this lack of identifiable license may not be intentional, without an identifiable license, the articles are free to read but do not allow extended reuse rights beyond reading. It is also not clear if Bronze articles are temporarily or permanently available to read for free.</p>
+ <p id="p-30">Finally, we should add that, although our categories of choice reflect the OA literature, they do not necessarily reflect the more complex reality of scholarly publishing today. Organizations like SciELO and Redalyc in Latin America have been acting simultaneously as publishers and repositories and many of the articles found on their site do not fall neatly into the above categories (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=The%20SciELO%20open%20access:%20a%20gold%20way%20from%20the%20south&amp;author=Packer&amp;publication_year=2010" title="The SciELO open access: a gold way from the south" data-jats-ref-type="bibr" data-jats-rid="ref-39">Packer, 2010</a>).</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">The oaDOI system</h4>
+ <p id="p-31">We assigned the categories above by calling the oaDOI service with a DOI for each item. The oaDOI returns a link to a legally-available OA version of the article, when one is available (<a class="ext-link" href="https://oadoi.org/" data-jats-ext-link-type="uri">https://oadoi.org/</a>). It contains records for all 88 million Crossref DOIs.<a class="xref xref-fn" href="#fn-3" data-jats-ref-type="fn" data-jats-rid="fn-3"><sup>3</sup></a> The oaDOI service crawls, aggregates, normalizes, and verifies data from many sources including PMC (<a class="ext-link" href="https://www.ncbi.nlm.nih.gov/pmc/" data-jats-ext-link-type="uri">https://www.ncbi.nlm.nih.gov/pmc/</a>), BASE (<a class="ext-link" href="https://www.base-search.net/about/en/" data-jats-ext-link-type="uri">https://www.base-search.net/about/en/</a>), DOAJ (<a class="ext-link" href="https://doaj.org/" data-jats-ext-link-type="uri">https://doaj.org/</a>), and thousands of institutional repositories and publishers. The oaDOI system offers a fast, free API with no rate-limits, allowing it to support a variety of other services and tools. At the time of writing, oaDOI processes approximately 500,000 requests daily–roughly twice the daily uses of Sci-Hub<a class="xref xref-fn" href="#fn-4" data-jats-ref-type="fn" data-jats-rid="fn-4"><sup>4</sup></a> (<a class="xref xref-bibr" href="https://doi.org/10.1126%2Fscience.352.6285.508" title="Who’s downloading pirated papers? Everyone" data-jats-ref-type="bibr" data-jats-rid="ref-13">Bohannon, 2016</a>; <a class="xref xref-bibr" href="https://doi.org/10.7287%2Fpeerj.preprints.3100v1" title="Sci-Hub provides access to nearly all scholarly literature (No. e3100v1)" data-jats-ref-type="bibr" data-jats-rid="ref-29">Himmelstein et al., 2017</a>). The majority of this volume comes from around 700 academic libraries, who use oaDOI to help readers find articles where the library has no subscription access, addressing the discoverability problem (<a class="xref xref-bibr" href="https://doi.org/10.1080%2F19322909.2013.795426" title="Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles" data-jats-ref-type="bibr" data-jats-rid="ref-16">Chen, 2013</a>). The oaDOI service also powers the Unpaywall browser extension, which helps readers to find legal OA copies of paywalled articles as they browse; Unpaywall currently has over 80,000 active users. The oaDOI codebase is open source, and the service is free and open via an open API.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">Accuracy of oaDOI</h4>
+ <p id="p-34">To assess the accuracy of our automated OA determination, a random subsample of 500 articles were chosen from our main “Crossref-DOI†sample, described below. We manually searched the internet for each article in our subsample to determine if the paper was freely available on the publisher’s website, or on another website, such as an institutional repository, an academic social networking site, or on a personal webpage. DOIs were resolved by appending the DOI to “<a class="ext-link" href="https://doi.org/" data-jats-ext-link-type="uri">https://doi.org/</a>â€. If the full text was available through that link, articles were marked as being freely available from the publisher’s site. If articles required a subscription, the title of the article was entered into Google Scholar (GS) and into Google to find alternative versions (i.e., preprints or archived copies). If the fulltext was found on any publisher page or OA repository, these were marked as being freely available from an archive. If the only available open copy was hosted on an academic social network (like Academia.edu or ResearchGate), this was noted but for the sake of the study these were <i>not</i> counted as any category of OA, and were instead added to the “Closed†category;</p>
+ <p id="p-35">The performance of oaDOI is summarized below, compared to these manual accuracy checks. The complete dataset behind this summary is available in supplementary information. Using this data we calculated the recall and precision of the system. “Recall†asks the question, “when an article is open, how often does oaDOI correctly identify it as open?†The recall of the service is 77.0%, meaning that 77% of the truly open articles are correctly identified as open by oaDOI. “Precision†asks the question, “When oaDOI says an article is open, how often is it correct?†The precision of the system is 96.6%, meaning that 96.6% of the time that oaDOI reports an article is open, it really is open.</p>
+ <p id="p-36">These results can be roughly compared to the recall of 86.4% and precision of 99.1% reported by <a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al. (2014)</a> for their automated system. Their accuracy estimate was also calculated based on a sample of 500 data points, giving each estimate a margin of error of ±4.5 percentage points. The Archambault study used a narrower date window for their sample (starting in 1996, versus our Crossref-DOI sample which was not time restricted), resulting in a more homogeneous task, which may partially explain their somewhat better performance.</p>
+ <p id="p-37">The oaDOI service is optimized for high precision, rather than high recall. The very high precision of oaDOI means that any estimates derived from the database can be considered a <i>conservative</i> estimate of the actual percentage of open access in the literature. That is, we can safely assume that when oaDOI reports a certain percentage of open access, the real percentage is <i>at least</i> that high—and almost certainly higher given that recall was less than perfect. Put another way, oaDOI delivers very few false positives (where it mistakenly calls an article open), but a relatively high number of false negatives (where it mistakenly calls an article closed) (<a class="xref xref-table" href="#table-1" data-jats-ref-type="table" data-jats-rid="table-1">Table 1</a>). Future improvements to the system are planned that will improve recall while keeping precision high.</p>
+ <figure class="table-wrap" id="table-1"><div class="caption">
+<span class="caption-label">Table 1: </span>
+ <div class="title">Accuracy of the prototype version of the oaDOI service used in this study.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Manual count (ground truth)</th>
+ <th>oaDOI reports Open</th>
+ <th>oaDOI reports Closed</th>
+ <th>Total</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Open</td>
+ <td>144</td>
+ <td>43</td>
+ <td>187</td>
+ </tr>
+ <tr>
+ <td>Closed</td>
+ <td>5</td>
+ <td>308</td>
+ <td>313</td>
+ </tr>
+ <tr>
+ <td>Total</td>
+ <td>149</td>
+ <td>351</td>
+ <td style="text-align:left;;">500</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-1</a>
+</div>
+ </figure>
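+ <p>As a quick consistency check, the recall and precision quoted above can be recomputed directly from the counts in Table 1 (a minimal sketch using only the numbers reported there):</p>
+ <pre><code>
+# Confusion-matrix counts from Table 1.
+true_pos = 144   # manually open, oaDOI reports open
+false_neg = 43   # manually open, oaDOI reports closed
+false_pos = 5    # manually closed, oaDOI reports open
+
+recall = true_pos / (true_pos + false_neg)      # 144/187 = 0.770
+precision = true_pos / (true_pos + false_pos)   # 144/149 = 0.966
+print(f"recall={recall:.1%} precision={precision:.1%}")
+</code></pre>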
+ </section>
+ </section>
+ <section class="sec">
+ <h3 class="heading">Study samples</h3>
+ <p id="p-38">Three samples of DOI-assigned scholarly resources are summarized in <a class="xref xref-table" href="#table-2" data-jats-ref-type="table" data-jats-rid="table-2">Table 2</a> and described further below.</p>
+ <section class="sec">
+ <h4 class="heading">Crossref sample</h4>
+ <p id="p-39">The first sample, “Crossref-DOIs,†is a random sample of 100,000 journal articles with Crossref DOIs, across all publication years. There are approximately 88 million Crossref DOIs in total as of May 2017. In order to exclude books, datasets, and other non-article content, we sampled only items whose “type†was listed as “journal-article†in the Crossref API metadata; there are 66 million of these. To verify the accuracy of Crossref metadata, we manually checked 150 items assigned to type “journal-article,†and determined that 93% were indeed journal articles; the remaining 7% were mostly journal front-matter such as tables of content or instructions to authors.</p>
+ <figure class="table-wrap" id="table-2"><div class="caption">
+<span class="caption-label">Table 2: </span>
+ <div class="title">Summary of samples used in this study.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover table-text" data-jats-content-type="text">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Sample name</th>
+ <th>Sample size</th>
+ <th>Population sampled</th>
+ <th>Purpose</th>
+ <th>Population size</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Crossref-DOIs</td>
+ <td>100,000</td>
+ <td>All journal articles with Crossref DOIs, all years.</td>
+ <td>Estimate percentage of the literature that is OA.</td>
+ <td>66,560,153</td>
+ </tr>
+ <tr>
+ <td>WoS-DOIs</td>
+ <td>100,000</td>
+ <td>All citable WoS articles with DOIs, 2009–2015.</td>
+ <td>Estimate citation impact of recent OA papers, and also OA prevalence by discipline.</td>
+ <td>8,083,613</td>
+ </tr>
+ <tr>
+ <td>Unpaywall-DOIs</td>
+ <td>100,000</td>
+ <td>All articles accessed by Unpaywall users over a 1-week period in 2017.</td>
+ <td>Estimate percentage of OA experienced by users of the Unpaywall extension.</td>
+ <td>213,323</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-2" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-2</a>
+</div>
+ </figure>
+ <p id="p-40">The purpose of this sample is to roughly proxy the scholarly literature as a whole. As such, it has strengths and weaknesses. One weakness is that although Crossref includes information on citation counts and discipline categorization, we found these to be quite incomplete, and therefore not useful for the present study. Another is that researchers in the scientometrics and OA fields have largely relied on other indexes, particularly Scopus and Web of Science (WoS), to represent the literature as a whole; this makes our results more difficult to compare to previous work. Finally, DOIs are known to be less frequently assigned by publishers in certain disciplines (like humanities; <a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2015.11.008" title="Availability of digital object identifiers (DOIs) in web of science and scopus" data-jats-ref-type="bibr" data-jats-rid="ref-25">Gorraiz et al., 2016</a>), in certain geographic regions (particularly the developing world), and among older articles (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-016-2225-6" title="Availability of digital object identifiers in publications archived by PubMed" data-jats-ref-type="bibr" data-jats-rid="ref-14">Boudry &amp; Chartron, 2017</a>); consequently, these segments will be underrepresented in our sample. This said, Scopus and WoS are also known to underrepresent important segments of the literature (<a class="xref xref-bibr" href="https://doi.org/10.1007%2Fs11192-015-1765-5" title="The journal coverage of Web of Science and Scopus: a comparative analysis" data-jats-ref-type="bibr" data-jats-rid="ref-37">Mongeon &amp; Paul-Hus, 2016</a>), and so this failing is not limited to Crossref. Moreover, the Crossref sample has important advantages of its own over other indexes. While no sample of the scholarly literature will be complete in every regard, the Crossref index is more expansive than other sources: in July 2017 there were 67 million journal articles indexed in Crossref compared to 30 million in Scopus (<a class="ext-link" href="https://www.elsevier.com/solutions/scopus/content" data-jats-ext-link-type="uri">https://www.elsevier.com/solutions/scopus/content</a>). Also, Crossref has the advantage of being entirely free and open to use, while Scopus and WoS are subscription-access databases; this allows the study data to also be free and open, promoting replication and reuse of our results in further research. However, we did turn to the subscription-access WoS in order to answer questions about the discipline and citation counts of OA articles, since Crossref data is lacking in these areas.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">WoS sample</h4>
+ <p id="p-41">The second sample, “WoS-DOIsâ€, is a random sample of 100,000 journal articles with DOIs that are indexed by Web of Science. The sample was drawn from a local version of the WoS database at the Observatoire des sciences et des technologies (OST) at the Université du Québec à Montréal. Only articles that WoS defines as “citable items†are included in the sample; this excludes non-peer reviewed content such as editorial material and news items. This sample is restricted to articles published between 2009 and 2015, due to DOI availability constraints. The sample of 100,000 articles is randomly drawn from a population of 8 million articles and reviews with a DOI in WoS published between 2009 and 2015 as of May 2017.</p>
+ <p id="p-42">Because the WoS sample is restricted to certain publication years, due to availability of DOIs in the WoS database, this sample is unsuitable for estimating the proportion of the total literature that is OA. However, it is more useful than the Crossref sample in some ways: the WoS sample included accurate discipline information for each article (described below), and also citation counts. Therefore we use the WoS sample to assess OA prevalence by discipline and also the citation impact of recent OA papers. We do not encourage comparisons between the OA percentages in the WoS sample and the Crossref sample, because of large differences in the sampling frames.</p>
+ <p id="p-43">Documents in the WoS-DOIs sample were classified using the National Science Foundation (NSF) journal classification system. This system assigns every journal exactly one “discipline†(a high-level categorization) and exactly one “specialty†(a finer-grained categorization). Because this is a journal-level classification, all articles from a given journal are assigned the same discipline and specialty as the journal. A downside of this approach is that the system classifies multidisciplinary journals (e.g., Nature, PNAS, PLOS ONE) as “biomedical researchâ€, despite their publishing many articles from other fields.<a class="xref xref-fn" href="#fn-5" data-jats-ref-type="fn" data-jats-rid="fn-5"><sup>5</sup></a> In these cases, we used a ground-up, article-by-article classification approach. Each article published in a list of multidisciplinary journals was assigned to the NSF specialty which appeared most frequently in its own reference list. In other words, papers published in multidisciplinary journals were classified at the article level (instead of at the journal level) to the subject area which they cite most frequently.<a class="xref xref-fn" href="#fn-6" data-jats-ref-type="fn" data-jats-rid="fn-6"><sup>6</sup></a> </p>
+ <p id="p-46">We assess the relative impact of open and closed articles, using citations as an indicator of their scholarly impact. There are several properties of articles, however, that can confound this kind of comparison. Chief among these are the article’s discipline (some fields are much more cited than others) and its age (older articles have had more time to gather citations). In order to address this, we computed a normalized expected number of citations for each article, based on its age and its NSF specialty, by comparing it to the average citations for similar articles.<a class="xref xref-fn" href="#fn-7" data-jats-ref-type="fn" data-jats-rid="fn-7"><sup>7</sup></a> </p>
+ <p id="p-48">Using this approach, each article receives an average relative citation (ARC). An ARC of 1.0 indicates that a document was cited according to expectations based on documents published in the same year and NSF specialty, while an ARC above or below 1.0 indicates that the citation impact was above or below world average, respectively. Using these field-normalized citation rates, citation impact can be compared across scientific disciplines as well as across years. We can also compute mean ARCs for groups of articles, like “all open articles†or “all closed articlesâ€, allowing us to compare normalized impact between these two groups. Analyzing results on the level of NSF disciplines, data is not shown for the Humanities (<i>n</i> = 1,091) and Arts (<i>n</i> = 164), because they are underrepresented both in the Web of Science and in terms of DOI coverage.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">Unpaywall sample</h4>
+ <p id="p-49">The third sample, “Unpaywall-DOIsâ€, is a random sample of 100,000 articles accessed by users of the free, open-source Unpaywall browser extension, gathered over a one-week time window. We collected IP addresses and DOI requests made to the oaDOI service through the Unpaywall browser extension during the week of June 5–June 11, 2017. In that time period there were 374,703 total accesses, 213,323 unique DOIs, and 42,894 unique IP addresses gathered in total, from which 100,000 unique DOIs were randomly sampled.</p>
+ <p id="p-50">This sample was used to assess the prevalence of OA experienced by users of the Unpaywall extension (since Unpaywall uses oaDOI data to find OA). It is a convenience sample of what articles people are interested in reading, and thereby lets us roughly estimate the percent of this literature that is OA. The sample has serious limitations, however: we don’t know the demographics of Unpaywall users, and we are aware of a bias towards users from the US (as determined by the IP addresses). As such, we cannot accurately generalize the results by education level, discipline, or purpose in reading the scholarly literature.</p>
+ </section>
+ </section>
+ </section>
+ <section class="sec" id="results">
+ <h2 class="heading">Results</h2>
+ <section class="sec">
+ <h3 class="heading">RQ1. What percent of the literature is open access?</h3>
+ <section class="sec">
+ <h4 class="heading">How much of the literature is OA?</h4>
+ <p id="p-51">We found 27.9% (95% CI [27.6–28.2]) of all DOI-assigned journal articles are OA, using the Crossref-DOI sample. Based on this, we estimate there are 18.6 million OA articles with Crossref DOIs (95% CI [18.4–18.8]). This is the total population of OA articles that can be identified and accessed by oaDOI. Given our finding (described in Methods above) that the oaDOI service finds 77% of OA compared to manual searches, we can further estimate that an additional 3.5 million articles are OA but not detectable by this version of oaDOI.</p>
+ <p id="p-52">People reading the literature using the Unpaywall browser extension encounter a significantly higher proportion of OA: we found that 47.0% (95% CI [46.7–47.3]) of the Unpaywall-accessed sample is open access. The main reason for this is article age: since this sample is based on the behavior of actual readers, it is disproportionately comprised of recent articles. In fact, half the accessed articles were published in the last 2 years. Recent articles are much more likely to be OA than their older counterparts (see Results ‘How does Open Access vary by year of publication?’ below).</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">What types of Open Access are most common?</h4>
+ <p id="p-53">The proportion of OA by subtype is relatively similar across the samples, as shown in <a class="xref xref-fig" href="#fig-1" data-jats-ref-type="fig" data-jats-rid="fig-1">Fig. 1</a> and <a class="xref xref-table" href="#table-3" data-jats-ref-type="table" data-jats-rid="table-3">Table 3</a>. Green OA represents a relatively small percentage of OA articles in all three samples. This is partly because self-archived articles are only counted as Green where there is no publisher-hosted option available; that is, Green OA is sometimes “shadowed†by Gold, Bronze, or Hybrid articles. Bronze is the most common OA subtype in all the samples, which is particularly interesting given that few studies have highlighted its role. We manually inspected a small sample of Bronze articles in order to understand this subcategory more; we found that while many Bronze articles were Delayed OA from toll-access publishers, nearly half were hosted on journals that published 100% of content as free-to-read but were <i>not</i> listed on the DOAJ and did not formally license content (using CC-BY or any other license). Such journals might be better described as “Dark Gold†or “Hidden Gold†than Bronze. A more complete examination of Bronze falls outside the scope of this study, and therefore further investigation will be undertaken in future work.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-1"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 1: Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-small.jpg 355w" data-image-id="fig-1" alt="Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="230"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 1: </span>Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-1-full.png" class="btn btn-mini" download="peerj-4375-fig-1.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-1</a>
+</div>
+</div></figcaption></figure>
+ <figure class="table-wrap" id="table-3"><div class="caption">
+<span class="caption-label">Table 3: </span>
+ <div class="title">Percent of the literature that is OA, by type, in three samples of 100,000 journal articles, with 95% confidence intervals.</div>
+ </div>
+
+ <div class="table-container"><table class="table table-bordered table-condensed table-hover">
+ <colgroup>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ <col>
+ </colgroup>
+ <thead>
+ <tr>
+ <th>Access type</th>
+ <th style="text-align:center;" colspan="2">Crossref-DOI All journal articles with Crossref DOIs, all years. (“Articles with DOIs†in <a class="xref xref-fig" href="#fig-1" data-jats-ref-type="fig" data-jats-rid="fig-1">Fig. 1</a>)</th>
+ <th style="text-align:center;" colspan="2">WoS-DOIs All citable WoS articles with DOIs, 2009–2015</th>
+ <th style="text-align:center;" colspan="2">Unpaywall-DOIs All articles accessed by Unpaywall users over a 1-week period in 2017</th>
+ </tr>
+ <tr>
+ <th></th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ <th>Estimate</th>
+ <th>95% CI</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>OA (all types)</td>
+ <td>27.9%</td>
+ <td>27.6–28.2</td>
+ <td>36.1%</td>
+ <td>36.0–36.2</td>
+ <td>47.0%</td>
+ <td>46.7–47.3</td>
+ </tr>
+ <tr>
+ <td>Bronze OA</td>
+ <td>16.2%</td>
+ <td>16.0–16.5</td>
+ <td>12.9%</td>
+ <td>12.6–13.2</td>
+ <td>15.3%</td>
+ <td>15.0–15.6</td>
+ </tr>
+ <tr>
+ <td>Hybrid OA</td>
+ <td>3.6%</td>
+ <td>3.3–3.9</td>
+ <td>4.3%</td>
+ <td>4.0–4.6</td>
+ <td>8.3%</td>
+ <td>8.0–8.6</td>
+ </tr>
+ <tr>
+ <td>Gold OA</td>
+ <td>3.2%</td>
+ <td>2.9–3.5</td>
+ <td>7.4%</td>
+ <td>7.1–7.7</td>
+ <td>14.3%</td>
+ <td>14.0–14.6</td>
+ </tr>
+ <tr>
+ <td>Green OA</td>
+ <td>4.8%</td>
+ <td>4.5–5.1</td>
+ <td>11.5%</td>
+ <td>11.2–11.8</td>
+ <td>9.1%</td>
+ <td>8.8–9.4</td>
+ </tr>
+ <tr>
+ <td>Closed</td>
+ <td>72.0%</td>
+ <td>71.8–72.4</td>
+ <td>63.9%</td>
+ <td>63.8–64.0</td>
+ <td>53.0%</td>
+ <td>52.7–53.3</td>
+ </tr>
+ </tbody>
+ </table></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/table-3" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/table-3</a>
+</div>
+ </figure>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary by year of publication?</h4>
+ <p id="p-54"><a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Figure 2</a> presents the number (<a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2A</a>) and proportion (<a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2B</a>) of papers by access category and publication date. Articles published in the last 20 years are increasingly OA, and this trend shows no sign of slowing. More recent articles are more likely to be OA, with the most recent year examined also containing the most OA: 44.7% of 2015 articles are OA (95% CI [43.3–46.2%]), including 17.6% Bronze (95% CI [16.2–19.1]), 9.4% Hybrid (95% CI [8.0–10.9]), 11.3% Gold (95% CI [9.9–12.8]), and 6.3% Green (95% CI [4.9–7.8]). Well over one million OA papers were published in 2015. This growth trend has largely been driven by dramatic growth in Gold and Hybrid OA since the year 2000. However, more than 20% of papers published before the digital age are also freely available. The majority of these older OA papers are Bronze, and based on their age they are probably more precisely Delayed OA, although additional investigation will be required to confirm this. Bronze OA remains remarkably constant as a proportion of the literature for all publication years examined.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-2"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 2: Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-small.jpg 355w" data-image-id="fig-2" alt="Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="216"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 2: </span>Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-2-full.png" class="btn btn-mini" download="peerj-4375-fig-2.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-2" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-2</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-55">The number and proportion of Green papers must be interpreted with particular caution, due to several factors. First, unlike publisher-hosted OA (Gold, Bronze, and Hybrid), the date when the Green article <i>became open</i> is generally different from the date the article was <i>first published</i>. Authors often self-archive articles years after (or before, in the case of preprints) their original publication, leading to so-called “backfilling†of Green stocks (<a class="xref xref-bibr" href="http://science-metrix.com/sites/default/files/science-metrix/publications/d_1.8_sm_ec_dg-rtd_proportion_oa_1996-2013_v11p.pdf" title="Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013" data-jats-ref-type="bibr" data-jats-rid="ref-5">Archambault et al., 2014</a>). Consequently, the graph cannot show the growth of Green OA over time; this would require longitudinal analysis over several years, and so is outside the scope of this analysis. Instead it shows the number and proportion of Green OA by publication year of the article. Second, many articles cannot be legally self-archived until a certain number of months after publication; this embargoing likely influences the apparent plateau in Green shown in <a class="xref xref-fig" href="#fig-2" data-jats-ref-type="fig" data-jats-rid="fig-2">Fig. 2</a>. Finally, as noted earlier, many self-archived articles would otherwise be Green except for being “shadowed†by a Gold, Bronze, or Hybrid of the same article elsewhere. For more detail on the growth of shadowed Green OA, see <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Figs. SA2</a> and <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">SA3</a>.</p>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary by publisher?</h4>
+ <p id="p-56">We analyzed a subset of the Crossref-DOIs sample by publisher (as listed on the Crossref metadata record) to understand how the extent and types of OA are common across publishers for recent publications (between 2009 and 2015). As we can see in <a class="xref xref-fig" href="#fig-3" data-jats-ref-type="fig" data-jats-rid="fig-3">Fig. 3A</a>, the largest publishers by volume publish the most OA articles by volume, led by Elsevier. As a proportion of all articles published (<a class="xref xref-fig" href="#fig-3" data-jats-ref-type="fig" data-jats-rid="fig-3">Fig. 3B</a>), however, PLOS and Hindawi distinguish themselves as being the only publishers in the top 20 with 100% OA. More than half of the papers published by Oxford University Press, Nature Publishing Group, IOP Publishing, and the American Physical Society (APS) are freely available online. In the case of APS this is largely driven by content available through repositories such as arXiv (for more details on repositories, see <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA1</a>).</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-3"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 3: Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-small.jpg 355w" data-image-id="fig-3" alt="Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="282"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 3: </span>Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015.</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-3-full.png" class="btn btn-mini" download="peerj-4375-fig-3.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-3" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-3</a>
+</div>
+</div></figcaption></figure>
+ </section>
+ <section class="sec">
+ <h4 class="heading">How does Open Access vary across disciplines?</h4>
+ <p id="p-57">We used the WoS-DOIs sample to examine OA prevalence differences by discipline, because of the easy availability of discipline metadata in the WoS index. <a class="xref xref-fig" href="#fig-4" data-jats-ref-type="fig" data-jats-rid="fig-4">Figure 4</a> displays our results. More than half of the publications are freely available in biomedical research and mathematics, while in chemistry and engineering &amp; technology less than 20% of the papers are freely available. <a class="xref xref-fig" href="#fig-4" data-jats-ref-type="fig" data-jats-rid="fig-4">Figure 4</a> also highlights the popularity of Green OA in disciplines like physics and mathematics, where more than one fifth of papers are available only through online repositories (mainly arXiv). Hybrid articles are particularly prevalent in mathematics (9.4%), biomedical research (8.1%) and clinical medicine (6.3%), while authors in biomedical research (15.3%), health (11.7%), mathematics (11.2%) and clinical medicine (10.3%) often publish in Gold journals.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-4"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 4: Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities)." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-small.jpg 355w" data-image-id="fig-4" alt="Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities)." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="241"></a></div>
+<figcaption itemprop="description">
+ <h5 class="heading">
+<span class="caption-label">Figure 4: </span>Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities).</h5>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-4-full.png" class="btn btn-mini" download="peerj-4375-fig-4.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-4" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-4</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-58">Large variations can also be observed on the more detailed level of NSF specialties (<a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA5</a>). At more than 80% of OA articles, astronomy &amp; astrophysics (87%), fertility (86%), tropical medicine (84%), and embryology (83%) were the specialties where access to literature was the most open. At the other end of the spectrum are pharmacy (7%), inorganic &amp; nuclear chemistry (7%), and chemical engineering (9%), where publications were hidden behind a paywall for more than 90% of papers. More detail on these and other NSF specialties can be seen in <a class="xref xref-supplementary-material" href="#supp-1" data-jats-ref-type="supplementary-material" data-jats-rid="supp-1">Fig. SA1</a>.</p>
+ </section>
+ </section>
+ <section class="sec">
+ <h3 class="heading">RQ2. What is the scholarly impact of open access?</h3>
+ <p id="p-59">Comparing the average relative citation impact of different access categories, the OACA is corroborated: Papers hidden behind a paywall were cited 10% below world average (ARC = 0.90), while those that are freely available obtain, on average, 18% more citations than what is expected (ARC = 1.18). However, citation impact differs between the different manners in which papers are made available for free: those that are only available as Green OA (ARC = 1.33) and Hybrid OA papers (ARC = 1.31) are cited the most with an impact of more than 30% above expectations, those available as Bronze are cited 22% above world average, while papers published as Gold OA obtain an ARC of 0.83. This constitutes an average relative citation impact of 17% below world average and 9% below that of articles hidden behind a paywall. <a class="xref xref-fig" href="#fig-5" data-jats-ref-type="fig" data-jats-rid="fig-5">Figure 5</a> below describes these findings.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-5"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 5: Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w" data-image-id="fig-5" alt="Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="388"></a></div>
+<figcaption itemprop="description">
+ <h4 class="heading">
+<span class="caption-label">Figure 5: </span>Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015.</h4>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-full.png" class="btn btn-mini" download="peerj-4375-fig-5.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-5" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-5</a>
+</div>
+</div></figcaption></figure>
+ <p id="p-60">These trends vary over time, however, as shown in <a class="xref xref-fig" href="#fig-6" data-jats-ref-type="fig" data-jats-rid="fig-6">Fig. 6</a>. While the ARC of closed access papers remains below world average throughout the period studied, it increased from .86 in 2009 to .93 over in 2014 and 2015. Meanwhile, when looking across all open types, the mean citation rate is consistently above the world average, fluctuating between 1.15 and 1.22. This fluctuation is guided by differences between the access types, with the impact of Hybrid OA papers increasing over the time period. While Green OA papers’ mean citation rate remain relatively stable, the highest impact, for 2015, is obtained by Bronze and Hybrid. The only form of open for which mean impact has decreased steadily over time is Gold. The results for more recent years are only based on a short citation window, however, and results might change over the next years as citations accumulate.</p>
+ <figure class="fig" itemprop="image" itemscope="itemscope" itemtype="https://schema.org/ImageObject" id="fig-6"><div class="image-container"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-2x.jpg" title="View the full image" class="fresco" data-fresco-caption="Figure 6: Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication." data-fresco-group="figure" data-fresco-options="fit: 'width', ui: 'outside', thumbnails: false, loop: true, position: true, overflow: true, preload: false"><img class="graphic" src="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-1x.jpg" itemprop="contentUrl" sizes="(min-width: 1200px) 581px, (max-width: 1199px) and (min-width: 980px) 462px, (max-width: 979px) and (min-width: 768px) 347px, (max-width: 767px) calc(100vw - 50px)" srcset="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-small.jpg 355w" data-image-id="fig-6" alt="Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication." data-full="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-full.png" data-thumb="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-thumb.jpg" data-original="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6.png" data-image-type="figure" data-jats-mimetype="image" data-jats-mime-subtype="png" width="600" height="465"></a></div>
+<figcaption itemprop="description">
+ <h4 class="heading">
+<span class="caption-label">Figure 6: </span>Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication.</h4>
+ <div class="figcaption-footer">
+<div class="article-image-download"><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-6-full.png" class="btn btn-mini" download="peerj-4375-fig-6.png" itemprop="url"><i class="icon-large icon-picture"> </i> Download full-size image</a></div>
+<div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/fig-6" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/fig-6</a>
+</div>
+</div></figcaption></figure>
+ </section>
+ </section>
+ <section class="sec">
+ <h2 class="heading">Discussion and Conclusion</h2>
+ <p id="p-61">Access to scholarly literature is at the heart of current debates in the research community. Research funders are increasingly mandating OA dissemination to their grantees while, at the same time, the growth in toll-access subscriptions costs have prompted more and more university libraries to cancel subscriptions. In this context, several tools have been developed to provide access–both legally and illegally–to scholarly literature. Using data from one of these tools (oaDOI), this paper addresses two broad research questions: what percent of the literature is OA and how does it vary by type of OA, and what is the mean scholarly impact of papers diffused through this form. Three large samples were used, to assess different aspects of OA patterns: (1) 100,000 articles that have a Crossref DOIs, which allows us to assess the relative proportion of OA across all existing literature; (2) 100,000 WoS-indexed journals articles that have a DOI, which allows us to assess the scholarly impact of OA and non OA papers; (3) 100,000 articles accessed by users through the Unpaywall browser extension, which lets us assess the proportion of OA papers found by users of this free tool.</p>
+ <p id="p-62">We found that 28% of all journal articles are freely available online (Crossref-DOI sample). Encouragingly for proponents of OA, this proportion has been growing steadily over the last 20 years, driven particularly by growth in Gold and Hybrid. Articles from 2015, the most recent year examined, had the highest proportion OA (45%), as well as the largest absolute number of OA articles published in a single year. This disproportionate level of OA in recent years, combined with readers’ preference for more recent articles, leads to a felicitous situation for readers: the proportion of OA they <i>experience</i> as they browse and search is better than the overall percentage of OA across the literature as a whole. Users of the Unpaywall browser extension, which gives individual readers access to the oaDOI service, encounter OA articles nearly half (47%) of the time. The effect almost certainly extends beyond Unpaywall users; one may assume readers in general also favor newer articles, and therefore benefit from the growth of Gold, Bronze, and Hybrid OA among recent papers, even without using Unpaywall. More studies of readership data from other sources would be useful to quantify this further.</p>
+ <p id="p-63">Interestingly, we found that the majority of OA articles are Bronze–hosted on publisher websites, either without a license at all or without an open license. This is surprisingly high given that Bronze is relatively little-discussed in the OA literature, and suggests that this OA category deserves further attention from the OA community. In particular, Bronze OA may be significant in a policy context, since, unlike other publisher-hosted OA, Bronze articles do not extend any reuse rights beyond reading, making them Gratis OA. Much more research is needed into the characteristics of Bronze OA. How many Bronze articles are licensed openly, but do not make their license available? Is Bronze disproportionately non-peer-reviewed content? How much of Bronze OA is also Delayed OA? How much Bronze is Promotional, and how transient is the free-to-read status of this content? How many Bronze articles are published in “hidden gold†journals that are not listed in the DOAJ? Why are these journals not defining an explicit license for their content, and are there effective ways to encourage this? These and other questions are outside the scope of this study but may provide fruitful insights for future OA research and policy.</p>
+ <p id="p-64">Only about 7% of the literature overall (and 17% of the OA literature) is Green. This is may at first seem disappointing, given years of advocacy focused on Green OA as well as ongoing growth in the number of Green OA mandates (<a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Anatomy%20of%20green%20open%20access&amp;author=Bj%C3%B6rk&amp;publication_year=2014" title="Anatomy of green open access" data-jats-ref-type="bibr" data-jats-rid="ref-11">Björk et al., 2014</a>). However, the full context of Green OA provides reasons for optimism. First, many papers are archived in repositories but are not counted as Green in this analysis because they are also available on the publisher site as Hybrid, Gold, or Bronze versions. These “shadowed Green†copies provide a useful safety net that preserves access in cases where publishers rescind it (as could potentially happen with Delayed OA and other Bronze articles). Further research is needed to determine the prevalence of shadowed Green OA in various disciplines. Second, the phenomenon of “backfilling†(authors self-archiving content published across all years, not just the current one) means that although the percentage graph of Green OA does not show the same year-over-year slope as Gold or Hybrid, the line itself may be rising across <i>all</i> years as authors gradually self-archive papers from years or even decades ago. This assumption is supported by results reported by <a class="xref xref-bibr" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom" title="Research impact of paywalled versus open access papers" data-jats-ref-type="bibr" data-jats-rid="ref-6">Archambault et al. (2016)</a>. Finally, the relatively low proportion of green OA encouragingly leaves room for continued growth. While most journals published by major publishers (Elsevier, Wiley, Springer, etc.) allow for self-archiving, research shows that only a small proportion of papers from these publishers actually are self-archived in OA repositories; for example, <a class="xref xref-bibr" href="https://scholar.google.com/scholar_lookup?title=Knowledge%20sharing%20in%20global%20health%20research;%20the%20impact,%20uptake%20and%20cost%20of%20open%20access%20to%20scholarly%20literature&amp;author=Smith&amp;publication_year=" title="Knowledge sharing in global health research; the impact, uptake and cost of open access to scholarly literature" data-jats-ref-type="bibr" data-jats-rid="ref-42">Smith et al. (in press)</a> report using a sample of Global Health Research papers that only 39% of them made use of available self-archiving rights.</p>
+ <p id="p-65">Our results confirm the Open Access Citation Advantage found by other studies: open articles receive 18% more citations than otherwise expected. While at least some of this boost is likely due to the fact that more access allows more people to read and hence cite articles they otherwise would not, causation is difficult to establish and there are many possible confounders. Most discussed is the so-called “selection bias postulateâ€, (<a class="xref xref-bibr" href="https://doi.org/10.1016%2Fj.joi.2007.04.001" title="Do open access articles have greater citation impact?" data-jats-ref-type="bibr" data-jats-rid="ref-18">Craig et al., 2007</a>) which suggests that authors choose only their most impactful work to make OA. The current study does not examine the cause or directionality of correlation, but does find that it exists in a very large sample that is relatively representative of the literature as a whole. Funder requirements may also play a role in the observed citation advantage: high-profile funders are more likely to have an OA publishing requirement; at the same time, well funded studies are independently more likely to receive more citations than poorly funded studies (<a class="xref xref-bibr" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/" title="Measuring the scientific output and impact of NIGMS grants" data-jats-ref-type="bibr" data-jats-rid="ref-7">Berg, 2010</a>). Interestingly, Gold articles are actually cited <i>less</i>, likely due to an increase in the number of newer and smaller OA journals. Some of these journals are from regions of the world not historically indexed by WoS, are published in languages other than English, or might be considered to be less prestigious because they have not had time to become established or accumulate citations (<a class="xref xref-bibr" href="http://www.science-metrix.com/pdf/SM_EC_OA_Availability_2004-2011.pdf" title="Proportion of open access peer-reviewed papers at the European and world levels–2004–2011" data-jats-ref-type="bibr" data-jats-rid="ref-4">Archambault et al., 2013</a>). On the flip side, the citation disadvantage of Gold OA is likely also affected by the continued growth of so-called ‘mega journals’ such as PLOS ONE (<a class="xref xref-bibr" href="http://journals.plos.org/plosone/s/reviewer-guidelines#loc-criteria-for-publication" title="Reviewer guidelines: criteria for publication" data-jats-ref-type="bibr" data-jats-rid="ref-40"> PLOS, 2018</a>). Whatever the reason, the lower impact of Gold means the overall citation advantage is strongly driven by Green, Hybrid, and Bronze content. In sum, while several factors can affect the observed differences in citation rates, and causation remains difficult to establish, the fact remains that scholars are much more likely to read and cite papers to which they have access than those that they cannot obtain. Hopefully the existence of a free, open index of OA content will help support further research into the OACA question.</p>
+ <p id="p-66">The relatively high percentage of OA found in this study, particularly among readers of the free Unpaywall extension, has important potential implications for academic libraries. Increasingly, these libraries are under pressure to meet growing prices of “Big Deal†subscription packages, and the once-unthinkable outcome of canceling these Big Deals is becoming an increasingly realistic option. In this environment, knowing that around half of the literature of interest is available without any subscription may tip the scales toward cancellation for some institutions–particularly given that this percentage seems to be growing steadily. Indeed, the Université de Montréal’s cancellation of their Taylor &amp; Francis subscription package (<a class="xref xref-bibr" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm" title="UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group" data-jats-ref-type="bibr" data-jats-rid="ref-48">Université de Montréal, 2017</a>) is particularly interesting, given that their cancellation announcement directly pointed faculty to Unpaywall and other tools to help them access OA content. This may seem a radical suggestion, but cancellation of subscription journals has long been part of the universal OA roadmap (<a class="xref xref-bibr" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/" title="The forbidden forecast: thinking about open access and library subscriptions" data-jats-ref-type="bibr" data-jats-rid="ref-2">Anderson, 2017b</a>). Even when the percentage of OA is not enough to support outright cancellation, it may be enough to negotiate better subscription rates by supporting calculation of “OA-adjusted Cost Per Access†(<a class="xref xref-bibr" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf" title="Leveraging the growth of open access in library collection decision making" data-jats-ref-type="bibr" data-jats-rid="ref-3">Antelman, 2017</a>). However, much more study is needed to see how OA availability varies across journals and Big Deal packages, along with praxis-oriented work building OA analysis tools that help librarians make cancellation choices.</p>
+ <p id="p-67">This study has several important limitations. Our dataset only includes journal articles with DOIs, which means that disciplines and geographical areas which rely more heavily on conference papers or articles without DOIs are underrepresented. Our Crossref sample includes about 7% journal “front matter†that the journal has assigned a DOI and Crossref labelled “journal article†but is actually a page describing the journal Editorial Board or similar. Our Bronze OA category includes articles published in OA journals which aren’t indexed in DOAJ; future work must identify these OA journals and classify such articles as Gold. As discussed in our definition of OA, when finding open copies we ignored free-to-read articles from academic social networks like ResearchGate and Academia.edu. The oaDOI system has some coverage of articles published on personal web pages, but this is quite limited compared to web-scale indexes like Google. The oaDOI system includes thousands of institutional and subject repositories, but there are some repositories that it misses. Our accuracy checks suggest that oaDOI, and therefore this study, are probably overlooking around 23% of OA otherwise discoverable using web searches, meaning that estimates in reported in this paper undercount OA by approximately 30%. Finally, our approach did not detect <i>when</i> articles were deposited into repositories. Because repositories are often backfilled with content that has been published many years ago, this study does not measure any increase/decrease in prevalence of Green OA over time, but only the proportion of Green OA by article publication date at the moment of data collection.</p>
+ <p id="p-68">In addition to the empirical results obtained, this paper clearly shows the potential of the oaDOI service for future research. The freely available oaDOI service provides scholars with the basis for assessing and monitoring the development of access to scholarly literature on a large scale, as well as the factors that affect it. For instance, our results show that the percentage of the literature available as OA is growing, and that articles diffused through this form are generally more cited than closed access articles. Several factors are likely to contribute to these trends; however, those remain poorly understood. Combined with other datasets–such as the WoS, Scopus, or Crossref–oaDOI allows one to assess at a large-scale the effects of various mandates on deposit rates, or to track the development of documents’ accessibility to determine, for example, when authors self-archive, or the sustainability of the promotional OA category. Aggregated at the level of journals and publishing platforms, these data can also provide librarians with indicators to help inform subscription cancellations and mitigate their effects. The application of the oaDOI algorithm on a large scale also allows for more complete analysis of the OA citation advantage across fields and time. As in <a class="xref xref-bibr" href="https://doi.org/10.1371%2Fjournal.pone.0013636" title="Self-selected or mandated, open access increases citation impact for higher quality research" data-jats-ref-type="bibr" data-jats-rid="ref-23">Gargouri et al. (2010)</a>, confounding factors could be mitigated by using article-level metadata to identify article pairs published in the same journal issue, on the same topic or published by the same authors at the same time. We hope that other scholars will dig deeper in those data to better understand OA dissemination and the factors that drive it. This is of utmost importance for the future of scholarly communication.</p>
+ </section>
+ <section class="sec" id="supplemental-information">
+ <h2 class="heading"> Supplemental Information</h2>
+ <div class="supplementary-material well well-small" id="supp-1" data-jats-mimetype="application" data-jats-mime-subtype="vnd.openxmlformats-officedocument.wordprocessingml.document">
+<h3 class="heading">Additional results</h3>
+
+ <div class="object-id article-component-doi">DOI: <a href="https://doi.org/10.7717/peerj.4375/supp-1" data-toggle="tooltip" title="Cite this object using this DOI">10.7717/peerj.4375/supp-1</a>
+</div>
+<div><a href="https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/appendix.docx" class="btn article-supporting-download" data-rel="supplement" download="appendix.docx" data-filename="appendix.docx"><i class="icon-large icon-download-alt"> </i> Download</a></div>
+</div>
+ </section>
+ </div>
+<div id="article-footnotes">
+<div class="fn article-footnote" id="fn-1"><span class="p">In the interest of full disclosure, it should be noted that two of the authors of the paper are the co-founders of Impactstory, the non-profit organization that developed oaDOI.</span></div>
+<div class="fn article-footnote" id="fn-2"><span class="p">Repositories that were included are those covered by the Bielefeld Academic Search Engine (BASE) in May 2017. A full listing of repositories can be found on their website at: <a class="ext-link" href="https://www.base-search.net/about/en/about_sources_date.php?menu=2&amp;submenu=1" data-jats-ext-link-type="uri">https://www.base-search.net/about/en/about_sources_date.php?menu=2&amp;submenu=1</a>
+ </span></div>
+<div class="fn article-footnote" id="fn-3"><span class="p">DOIs are short, unique identifiers for scholarly papers. Crossref is a nonprofit that helps a the DOI system, and is by far the largest supplier of academic DOIs in academia.</span></div>
+<div class="fn article-footnote" id="fn-4"><span class="p">Based on a Sci-Hub dataset released in 2016 (the most recent data available).</span></div>
+<div class="fn article-footnote" id="fn-5"><span class="p">These journals were identified by selecting journals with over a one thousand articles per year from those classified in the general “biomedical research†category. The full list of journals meeting these criteria were: PLOS ONE, Nature, Science, Scientific Reports, PNAS, Nature Communication, PeerJ, and Science Advances.</span></div>
+<div class="fn article-footnote" id="fn-6"><span class="p">Ties between frequently cited specialties were resolved randomly; that is, if a paper cites exactly the same amount of papers from two NSF specialties, it was assigned to one of the two at random</span></div>
+<div class="fn article-footnote" id="fn-7"><span class="p">Citations were normalized using the population of WoS articles and reviews with a DOI.</span></div>
+</div></main><footer class="back">
+ <section class="ack" id="acknowledgements"><h2 class="heading">Acknowledgements</h2>
+ <p>The authors would like to thank Dorothea Salo, Kristin Antelman, and John Sack for extensive and valuable comments on a draft of this article. The author order of JP and HP was determined by coin flip, as is their custom.</p>
+ </section>
+ <div class="sec" id="additional-information">
+ <h2 class="heading">Additional Information and Declarations</h2>
+ <div class="fn-group" data-jats-content-type="competing-interests">
+ <h3 class="heading">Competing Interests</h3>
+<div class="fn" id="conflict-1" data-jats-fn-type="conflict"><p>Heather Piwowar and Jason Priem are founders of Impactstory, a non-profit company which makes Unpaywall, oaDOI, and other tools to improve scholarly communication.</p></div>
+</div>
+ <div class="fn-group" data-jats-content-type="author-contributions">
+ <h3 class="heading">Author Contributions</h3>
+<div class="fn" id="contribution-1" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-1" data-jats-ref-type="contrib" data-jats-rid="author-1">Heather Piwowar</a>, <a class="xref xref-contrib" href="#author-2" data-jats-ref-type="contrib" data-jats-rid="author-2">Jason Priem</a> and <a class="xref xref-contrib" href="#author-9" data-jats-ref-type="contrib" data-jats-rid="author-9">Stefanie Haustein</a> conceived and designed the experiments, performed the experiments, analyzed the data, contributed reagents/materials/analysis tools, wrote the paper, prepared figures and/or tables, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-2" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-3" data-jats-ref-type="contrib" data-jats-rid="author-3">Vincent Larivière</a> conceived and designed the experiments, performed the experiments, analyzed the data, contributed reagents/materials/analysis tools, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-3" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-4" data-jats-ref-type="contrib" data-jats-rid="author-4">Juan Pablo Alperin</a> conceived and designed the experiments, performed the experiments, analyzed the data, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-4" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-5" data-jats-ref-type="contrib" data-jats-rid="author-5">Lisa Matthias</a> performed the experiments, analyzed the data, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-5" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-6" data-jats-ref-type="contrib" data-jats-rid="author-6">Bree Norlander</a> analyzed the data, wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-6" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-7" data-jats-ref-type="contrib" data-jats-rid="author-7">Ashley Farley</a> wrote the paper, reviewed drafts of the paper.</p></div>
+<div class="fn" id="contribution-7" data-jats-fn-type="con"><p><a class="xref xref-contrib" href="#author-8" data-jats-ref-type="contrib" data-jats-rid="author-8">Jevin West</a> reviewed drafts of the paper.</p></div>
+</div>
+ <div class="fn-group" data-jats-content-type="other">
+ <h3 class="heading">Data Availability</h3>
+<div class="fn" id="addinfo-1">
+<p>The following information was supplied regarding data availability:</p>
+ <p>Zenodo: <a class="ext-link" href="http://doi.org/10.5281/zenodo.837902" data-jats-ext-link-type="uri">http://doi.org/10.5281/zenodo.837902</a>.</p>
+ <p>The datasets behind the analysis in this paper are openly available at <a class="ext-link" href="http://dx.doi.org/10.5281/zenodo.837902" data-jats-ext-link-type="uri">http://dx.doi.org/10.5281/zenodo.837902</a> and the R statistics code can be found at <a class="ext-link" href="https://github.com/Impactstory/oadoi-paper1" data-jats-ext-link-type="uri">https://github.com/Impactstory/oadoi-paper1</a>. The oaDOI code is open source at <a class="ext-link" href="https://github.com/impactstory/oadoi" data-jats-ext-link-type="uri">https://github.com/impactstory/oadoi</a> and information about accessing the oaDOI API and full dataset is at <a class="ext-link" href="https://oadoi.org/api" data-jats-ext-link-type="uri">https://oadoi.org/api</a>.</p>
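+ <p>As an illustrative sketch only (assuming the v2 endpoint path, the <code>email</code> parameter, and the <code>is_oa</code>/<code>best_oa_location</code> response fields described in the API documentation linked above; details should be verified against that documentation), the OA status of a single DOI could be looked up like this:</p>
+<pre><code>
+# Minimal sketch (not from the paper): look up the OA status of one DOI via
+# the oaDOI API. Endpoint path and response field names are assumptions based
+# on the public docs at https://oadoi.org/api and may have changed.
+import requests
+
+def oa_status(doi, email="you@example.org"):
+    # The email parameter identifies the caller to the API operators.
+    resp = requests.get("https://api.oadoi.org/v2/" + doi,
+                        params={"email": email}, timeout=30)
+    resp.raise_for_status()
+    record = resp.json()
+    best = record.get("best_oa_location") or {}
+    return record.get("is_oa"), best.get("url")
+
+# Example: the DOI of this article
+print(oa_status("10.7717/peerj.4375"))
+</code></pre>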
+</div>
+</div>
+ <h3 class="heading">Funding</h3>
+<p>The authors received no funding for this work.</p>
+</div>
+ <section class="ref-list-container" id="references"><h2 class="heading">References</h2>
+<ul class="ref-list" data-jats-content-type="authoryear">
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-1">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Anderson</span></span>.</b> <b class="year" itemprop="datePublished">2017a</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/">When the wolf finally arrives: big deal cancelations in North American Libraries</a>.</cite> <span> <span class="comment">The Scholarly Kitchen. <a class="uri" href="https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/">https://scholarlykitchen.sspnet.org/2017/05/01/wolf-finally-arrives-big-deal-cancelations-north-american-libraries/</a>
+ </span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2018-01-09">09 January 2018</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-2">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Anderson</span></span>.</b> <b class="year" itemprop="datePublished">2017b</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/">The forbidden forecast: thinking about open access and library subscriptions</a>.</cite> <span> <span class="comment">The Scholarly Kitchen. <a class="uri" href="https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/">https://scholarlykitchen.sspnet.org/2017/02/21/forbidden-forecast-thinking-open-access-library-subscriptions/</a>
+ </span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2017-07-15">15 July 2017</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-3">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Antelman</span> <span class="given-names" itemprop="givenName">K</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://www.ala.org/acrl/sites/ala.org.acrl/files/content/conferences/confsandpreconfs/2017/LeveragingtheGrowthofOpenAccess.pdf">Leveraging the growth of open access in library collection decision making</a>.</cite> In: <span itemprop="name"><a class="conf-name" target="_blank" href="https://scholar.google.com/scholar_lookup?title=Proceeding%20from%20ACRL%202017:%20at%20the%20helm:%20leading%20transformation&amp;author=&amp;publication_year=2017">Proceeding from ACRL 2017: at the helm: leading transformation</a>.</span><span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-4">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amyot</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Deschamps</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nicol</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Provencher</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rebout</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Roberge</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <span class="article-title"> <span class="source">Proportion of open access peer-reviewed papers at the European and world levels–2004–2011</span>. </span><span class="institution">European Commission, Brussels</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-5">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amyot</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Deschamps</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nicol</span> <span class="given-names" itemprop="givenName">AF</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Provencher</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rebout</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Roberge</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <span class="article-title"> <span class="source">Proportion of open access papers published in peer-reviewed journals at the European and world levels–1996–2013</span>. </span><span class="institution">European Commission</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-6">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Archambault</span> <span class="given-names" itemprop="givenName">É</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Côté</span> <span class="given-names" itemprop="givenName">G</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Struck</span> <span class="given-names" itemprop="givenName">B</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Voorons</span> <span class="given-names" itemprop="givenName">M</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://digitalcommons.unl.edu/cgi/viewcontent.cgi?referer=https://www.google.com/&amp;httpsredir=1&amp;article=1028&amp;context=scholcom">Research impact of paywalled versus open access papers</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-7">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Berg</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/">Measuring the scientific output and impact of NIGMS grants</a>.</cite> <span> <span class="comment">NIGMS Feedback Loop Blog [Blog post]. <a class="uri" href="https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/">https://loop.nigms.nih.gov/2010/09/measuring-the-scientific-output-and-impact-of-nigms-grants/</a>
+ </span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-8">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2016a</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2016.08.002">Hybrid open access—a longitudinal study</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">10</b></span>(<span itemprop="issueNumber">4</span>)</span>:<span class="fpage" itemprop="pageStart">919</span>-<span class="lpage" itemprop="pageEnd">932</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-9">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B-C</span></span>.</b> <b class="year" itemprop="datePublished">2016b</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fleap.1021">The open access movement at a crossroad: are the big publishers and academic social media taking over?</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Learned Publishing</span></span> <b itemprop="volumeNumber">29</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">131</span>-<span class="lpage" itemprop="pageEnd">134</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-10">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fleap.1096">Gold, green, and black open access</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Learned Publishing</span></span> <b itemprop="volumeNumber">30</b></span>:<span class="fpage" itemprop="pageStart">173</span>-<span class="lpage" itemprop="pageEnd">175</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-11">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Paetau</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Anatomy%20of%20green%20open%20access&amp;author=Bj%C3%B6rk&amp;publication_year=2014">Anatomy of green open access</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Association for Information Science and Technology</span></span> <b itemprop="volumeNumber">65</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">237</span>-<span class="lpage" itemprop="pageEnd">250</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-12">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Majlender</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hedlund</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Guðnason</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0011273">Open access to the scientific journal literature: situation 2009</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">5</b></span>(<span itemprop="issueNumber">6</span>)</span>:<span class="fpage" itemprop="pageStart">e11273</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-13">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bohannon</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1126%2Fscience.352.6285.508">Who’s downloading pirated papers? Everyone</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Science</span></span> <b itemprop="volumeNumber">352</b></span>(<span itemprop="issueNumber">6285</span>)</span>:<span class="fpage" itemprop="pageStart">508</span>-<span class="lpage" itemprop="pageEnd">512</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-14">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Boudry</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chartron</span> <span class="given-names" itemprop="givenName">G</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-016-2225-6">Availability of digital object identifiers in publications archived by PubMed</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics March</span></span> <b itemprop="volumeNumber">110</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">1453</span>-<span class="lpage" itemprop="pageEnd">1469</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-15">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chawla</span> <span class="given-names" itemprop="givenName">D</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="http://www.sciencemag.org/news/2017/10/publishers-take-researchgate-court-alleging-massive-copyright-infringement">Publishers take ResearchGate to court, alleging massive copyright infringement</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Science News</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-16">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chen</span> <span class="given-names" itemprop="givenName">X</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F19322909.2013.795426">Journal article retrieval in an age of Open Access: how journal indexes indicate Open Access articles</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Web Librarianship</span></span> <b itemprop="volumeNumber">7</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">243</span>-<span class="lpage" itemprop="pageEnd">254</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-17">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Chen</span> <span class="given-names" itemprop="givenName">X</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Olijhoek</span> <span class="given-names" itemprop="givenName">T</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F00987913.2016.1182672">Measuring the degrees of openness of scholarly journals with the open access spectrum (OAS) evaluation tool</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Serials Review</span></span> <b itemprop="volumeNumber">42</b></span>(<span itemprop="issueNumber">2</span>)</span>:<span class="fpage" itemprop="pageStart">108</span>-<span class="lpage" itemprop="pageEnd">115</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-18">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Craig</span> <span class="given-names" itemprop="givenName">ID</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Plume</span> <span class="given-names" itemprop="givenName">AM</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McVeigh</span> <span class="given-names" itemprop="givenName">ME</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Pringle</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Amin</span> <span class="given-names" itemprop="givenName">M</span></span>.</b> <b class="year" itemprop="datePublished">2007</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2007.04.001">Do open access articles have greater citation impact?</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">1</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">239</span>-<span class="lpage" itemprop="pageEnd">248</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-19">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Creative Commons</span>.</b> <b class="year" itemprop="datePublished">2018</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://creativecommons.org/licenses/by/4.0/">Attribution 4.0 International (CC BY 4.0)</a></cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-20">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Davis</span> <span class="given-names" itemprop="givenName">PM</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1096%2Ffj.11-183988">Open access, readership, citations: a randomized controlled trial of scientific journal publishing</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">FASEB Journal</span></span> <b itemprop="volumeNumber">25</b></span>:<span class="fpage" itemprop="pageStart">2129</span>-<span class="lpage" itemprop="pageEnd">2134</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-21">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Davis</span> <span class="given-names" itemprop="givenName">PM</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Walters</span> <span class="given-names" itemprop="givenName">WH</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.3163%2F1536-5050.99.3.008">The impact of free access to the scientific literature: a review of recent research</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Medical Library Association</span></span> <b itemprop="volumeNumber">99</b></span>:<span class="fpage" itemprop="pageStart">208</span>-<span class="lpage" itemprop="pageEnd">217</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-22">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Fortney</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gonder</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2015</b>.</span> <span class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://osc.universityofcalifornia.edu/2015/12/a-social-networking-site-is-not-an-open-access-repository/index.html">A social networking site is not an open access repository</a>. <span class="source">Office of Scholarly Communication</span>. </span><span class="institution">University of California</span> </div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-23">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gargouri</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brody</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0013636">Self-selected or mandated, open access increases citation impact for higher quality research</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">5</b></span>(<span itemprop="issueNumber">10</span>)</span>:<span class="fpage" itemprop="pageStart">e13636</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-24">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gargouri</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2012</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://arxiv.org/abs/1206.3664">Green and gold open access percentages and growth, by discipline</a>.</cite> <span class="label label-working-paper">preprint</span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-25">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gorraiz</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Melero-Fuentes</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gumpenbergera</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Valderrama-Zuriánc</span> <span class="given-names" itemprop="givenName">J-C</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1016%2Fj.joi.2015.11.008">Availability of digital object identifiers (DOIs) in web of science and scopus</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Informetrics</span></span> <b itemprop="volumeNumber">10</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">98</span>-<span class="lpage" itemprop="pageEnd">109</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-26">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Greshake</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.12688%2Ff1000research.11366.1">Looking into Pandora’s Box: the content of <i>Sci-Hub</i> and its usage [version 1; referees: 2 approved, 2 approved with reservations]</a></cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">F1000Research</span></span> <b itemprop="volumeNumber">6</b></span> <span class="comment">Article 541</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-27">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>.</b> <b class="year" itemprop="datePublished">2006</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://arxiv.org/abs/cs/0606079">Ten-year cross-disciplinary comparison of the growth of open access and how it increases research citation impact</a>.</cite> <span class="label label-working-paper">preprint</span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-28">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Harnad</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brody</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Vallières</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Carr</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hitchcock</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Gingras</span> <span class="given-names" itemprop="givenName">Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Oppenheim</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hajjem</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hilf</span> <span class="given-names" itemprop="givenName">ER</span></span>.</b> <b class="year" itemprop="datePublished">2008</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1080%2F00987913.2008.10765150">The access/impact problem and the green and gold roads to open access: an update</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Serials Review</span></span> <b itemprop="volumeNumber">34</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">36</span>-<span class="lpage" itemprop="pageEnd">40</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-29">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Himmelstein</span> <span class="given-names" itemprop="givenName">DS</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Romero</span> <span class="given-names" itemprop="givenName">AR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McLaughlin</span> <span class="given-names" itemprop="givenName">SR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tzovaras</span> <span class="given-names" itemprop="givenName">BG</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Greene</span> <span class="given-names" itemprop="givenName">CS</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.7287%2Fpeerj.preprints.3100v1">Sci-Hub provides access to nearly all scholarly literature (No. e3100v1)</a></cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PeerJ Preprints</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-30">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Jamali</span> <span class="given-names" itemprop="givenName">HR</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-017-2291-4">Copyright compliance and infringement in ResearchGate full-text journal articles</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics</span></span> <b itemprop="volumeNumber">112</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">241</span>-<span class="lpage" itemprop="pageEnd">254</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-31">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>.</b> <b class="year" itemprop="datePublished">2012</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1186%2F1741-7015-10-124">Anatomy of open access publishing: a study of longitudinal development and internal structure</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">BMC Medicine</span></span> <b itemprop="volumeNumber">10</b></span> <span class="comment">Article 124</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-32">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">B</span></span>.</b> <b class="year" itemprop="datePublished">2013</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1002%2Fasi.22856">Delayed open access: an overlooked high-impact category of openly available scientific literature</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the American Society for Information Science and Technology</span></span> <b itemprop="volumeNumber">64</b></span>(<span itemprop="issueNumber">7</span>)</span>:<span class="fpage" itemprop="pageStart">1323</span>-<span class="lpage" itemprop="pageEnd">1329</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-33">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Laakso</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Welling</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bukvova</span> <span class="given-names" itemprop="givenName">H</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nyman</span> <span class="given-names" itemprop="givenName">L</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Björk</span> <span class="given-names" itemprop="givenName">BC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hedlund</span> <span class="given-names" itemprop="givenName">T</span></span>.</b> <b class="year" itemprop="datePublished">2011</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0020961">The development of open access journal publishing from 1993 to 2009</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">6</b></span>(<span itemprop="issueNumber">6</span>)</span>:<span class="fpage" itemprop="pageStart">e20961</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-34">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Matsubayashi</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kurata</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Sakai Y</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Morioka</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kato</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Morioka</span> <span class="given-names" itemprop="givenName">T</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kato</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mine</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ueda</span> <span class="given-names" itemprop="givenName">S</span></span>.</b> <b class="year" itemprop="datePublished">2009</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Status%20of%20open%20access%20in%20the%20biomedical%20field%20in%202005&amp;author=Matsubayashi&amp;publication_year=2009">Status of open access in the biomedical field in 2005</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of the Medical Library Association</span></span> <b itemprop="volumeNumber">97</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">4</span>-<span class="lpage" itemprop="pageEnd">11</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-35">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McCabe</span> <span class="given-names" itemprop="givenName">M</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Snyder</span> <span class="given-names" itemprop="givenName">C</span></span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1111%2Fecin.12064">Identifying the effect of open access on citations using a panel of science journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Economic Inquiry</span></span> <b itemprop="volumeNumber">52</b></span>(<span itemprop="issueNumber">4</span>)</span>:<span class="fpage" itemprop="pageStart">1284</span>-<span class="lpage" itemprop="pageEnd">1300</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-36">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McKiernan</span> <span class="given-names" itemprop="givenName">E</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Bourne</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Brown</span> <span class="given-names" itemprop="givenName">C</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Buck</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Kenall</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Lin</span> <span class="given-names" itemprop="givenName">J</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">McDougall</span> <span class="given-names" itemprop="givenName">D</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Nosek</span> <span class="given-names" itemprop="givenName">BA</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ram</span> <span class="given-names" itemprop="givenName">K</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Soderberg</span> <span class="given-names" itemprop="givenName">CK</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName"> Spies</span> <span class="given-names" itemprop="givenName"> JR</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Updegrove</span> <span class="given-names" itemprop="givenName">A</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Woo</span> <span class="given-names" itemprop="givenName">KH</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Yarkoni</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Rodgers</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.7554%2FeLife.16800">How open science helps researchers succeed</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" 
itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">eLife</span></span> <b itemprop="volumeNumber">5</b></span>:<span class="elocation-id" itemprop="pageStart">e16800</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-37">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mongeon</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Paul-Hus</span> <span class="given-names" itemprop="givenName">A</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1007%2Fs11192-015-1765-5">The journal coverage of Web of Science and Scopus: a comparative analysis</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Scientometrics</span></span> <b itemprop="volumeNumber">106</b></span>(<span itemprop="issueNumber">1</span>)</span>:<span class="fpage" itemprop="pageStart">213</span>-<span class="lpage" itemprop="pageEnd">228</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-38">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ottaviani</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1371%2Fjournal.pone.0159614">The post-embargo open access citation advantage: it exists (probably), it’s modest (usually), and the rich get richer (of course)</a></cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">PLOS ONE</span></span> <b itemprop="volumeNumber">11</b></span>(<span itemprop="issueNumber">8</span>)</span>:<span class="fpage" itemprop="pageStart">e0159614</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-39">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Packer</span> <span class="given-names" itemprop="givenName">AL</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=The%20SciELO%20open%20access:%20a%20gold%20way%20from%20the%20south&amp;author=Packer&amp;publication_year=2010">The SciELO open access: a gold way from the south</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Canadian Journal of Higher Education</span></span> <b itemprop="volumeNumber">39</b></span>(<span itemprop="issueNumber">3</span>)</span>:<span class="fpage" itemprop="pageStart">111</span>-<span class="lpage" itemprop="pageEnd">126</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-40">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">PLOS</span>.</b> <b class="year" itemprop="datePublished">2018</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://journals.plos.org/plosone/s/reviewer-guidelines#loc-criteria-for-publication">Reviewer guidelines: criteria for publication</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-41">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Schiermeier</span> <span class="given-names" itemprop="givenName">Q</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mega</span> <span class="given-names" itemprop="givenName">ER</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1038%2Fnature.2016.21223">Scientists in Germany, Peru and Taiwan to lose access to Elsevier journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Nature News</span></span> <b itemprop="volumeNumber">541</b></span>(<span itemprop="issueNumber">7635</span>)</span>:<span class="fpage" itemprop="pageStart">13</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-42">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Smith</span> <span class="given-names" itemprop="givenName">E</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Haustein</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Mongeon</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Fei</span> <span class="given-names" itemprop="givenName">S</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Ridde</span> <span class="given-names" itemprop="givenName">V</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Larivière</span> <span class="given-names" itemprop="givenName">V</span></span>.</b></span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=Knowledge%20sharing%20in%20global%20health%20research;%20the%20impact,%20uptake%20and%20cost%20of%20open%20access%20to%20scholarly%20literature&amp;author=Smith&amp;publication_year=">Knowledge sharing in global health research; the impact, uptake and cost of open access to scholarly literature</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">BMC Health Research Policy and System</span></span> <span class="comment">In Press</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-43">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">SPARC Europe</span>.</b> <b class="year" itemprop="datePublished">2015</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://sparceurope.org/what-we-do/open-access/sparc-europe-open-access-resources/open-access-citation-advantage-service-oaca/oaca-list/">The open access citation advantage: list of studies until 2015</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-44">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Suber</span> <span class="given-names" itemprop="givenName">P</span></span>.</b> <b class="year" itemprop="datePublished">2008</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://dash.harvard.edu/handle/1/4322580">Gratis and libre open access</a>.</cite> <span><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">SPARC Open Access Newsletter, 124</span></span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-45">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tennant</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://www.scienceopen.com/search#%7B%22order%22%3A0%2C%22context%22%3A%7B%22collection%22%3A%7B%22id%22%3A%22996823e0-8104-4490-b26a-f2f733f810fb%22%2C%22kind%22%3A0%7D%2C%22kind%22%3A11%7D%2C%22kind%22%3A77%7D">The open access citation advantage</a>.</cite> <span> <span class="access-date">(accessed <time class="date-in-citation" datetime="2017-08-02">2 August 2017</time>)</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-46">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Tennant</span> <span class="given-names" itemprop="givenName">JP</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Waldner</span> <span class="given-names" itemprop="givenName">F</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Jacques</span> <span class="given-names" itemprop="givenName">DC</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Masuzzo</span> <span class="given-names" itemprop="givenName">P</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Collister</span> <span class="given-names" itemprop="givenName">LB</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Hartgerink</span> <span class="given-names" itemprop="givenName">CH</span></span>.</b> <b class="year" itemprop="datePublished">2016</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.12688%2Ff1000research.8460.3">The academic, economic and societal impacts of Open Access: an evidence-based review (version 3; referees: 3 approved, 2 approved with reservations)</a></cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">F1000 Research</span></span> <b itemprop="volumeNumber">5</b></span> <span class="comment">Article 632</span></span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-47">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Universitat Konstanz</span>.</b> <b class="year" itemprop="datePublished">2014</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="https://www.uni-konstanz.de/universitaet/aktuelles-und-medien/aktuelle-meldungen/aktuelles/aktuelles/teurer-als-die-wissenschaft-erlaubt/">Teurer als die Wissenschaft erlaubt</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" id="ref-48">
+<span class="citation-authors-year"><b><span class="collab" itemprop="author" itemscope="itemscope">Université de Montréal</span>.</b> <b class="year" itemprop="datePublished">2017</b>.</span> <cite class="article-title"><a class="article-title" target="_blank" itemprop="url" href="http://www.bib.umontreal.ca/communiques/20170504-DC-annulation-taylor-francis-va.htm">UdeM Libraries cancel Big Deal subscription to 2231 periodical titles published by Taylor &amp; Francis Group</a>.</cite> <span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-49">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Wagner</span> <span class="given-names" itemprop="givenName">AB</span></span>.</b> <b class="year" itemprop="datePublished">2010</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.5062%2FF4Q81B0W">Open access citation advantage: an annotated bibliography</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Issues in Science and Technology Librarianship</span></span> <b itemprop="volumeNumber">60</b></span>:<span class="fpage" itemprop="pageStart">2</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-50">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Walker</span> <span class="given-names" itemprop="givenName">TJ</span></span>, <span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Soichi</span> <span class="given-names" itemprop="givenName">transl. T</span></span>.</b> <b class="year" itemprop="datePublished">1998</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://doi.org/10.1241%2Fjohokanri.41.678">Free internet access to traditional journals</a>.</cite> <span><span class="issue" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationIssue"><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Information Processing and Management</span></span> <b itemprop="volumeNumber">41</b></span>(<span itemprop="issueNumber">9</span>)</span>:<span class="fpage" itemprop="pageStart">678</span>-<span class="lpage" itemprop="pageEnd">694</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/ScholarlyArticle" id="ref-51">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Willinsky</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2003</b>.</span> <cite itemprop="name"><a class="article-title" target="_blank" itemprop="url" href="https://scholar.google.com/scholar_lookup?title=The%20nine%20flavours%20of%20open%20access%20scholarly%20publishing&amp;author=Willinsky&amp;publication_year=2003">The nine flavours of open access scholarly publishing</a>.</cite> <span><span class="volume" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/PublicationVolume"><span class="source" itemprop="isPartOf" itemscope="itemscope" itemtype="http://schema.org/Periodical"><span itemprop="name">Journal of Postgraduate Medicine</span></span> <b itemprop="volumeNumber">49</b></span>:<span class="fpage" itemprop="pageStart">263</span>-<span class="lpage" itemprop="pageEnd">267</span> </span>
+</div></li>
+<li class="ref"><div class="citation" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/Book" id="ref-52">
+<span class="citation-authors-year"><b><span class="name" itemprop="author" itemscope="itemscope" itemtype="http://schema.org/Person"><span class="surname" itemprop="familyName">Willinsky</span> <span class="given-names" itemprop="givenName">J</span></span>.</b> <b class="year" itemprop="datePublished">2009</b>.</span> <cite class="article-title"></cite> <span itemprop="name"><a class="source" target="_blank" href="https://scholar.google.com/scholar_lookup?title=The%20access%20principle:%20the%20case%20for%20open%20access%20to%20research%20and%20scholarship&amp;author=&amp;publication_year=2009">The access principle: the case for open access to research and scholarship</a></span><span> (<span class="edition">1 edition</span>). Cambridge: <span class="publisher">MIT Press</span>. </span>
+</div></li>
+</ul></section>
+ </footer></article>
+ </div>
+
+
+ </div>
+
+ </div>
+
+
+
+
+
+
+ </div>
+
+
+ <div id="push"></div>
+ </div>
+
+
+ <div id="alerts" data-async-alerts="/alerts/"></div>
+
+ <script src="/js/8d39319-35fca22.js"></script>
+ <script src="https://cdn.peerj.com/webpack/runtime.bfc7ab93.js"></script><script src="https://cdn.peerj.com/webpack/0.7880a6b6.js"></script><script src="https://cdn.peerj.com/webpack/1.24ea793f.js"></script><script src="https://cdn.peerj.com/webpack/vue-bundle.9bf24d69.js"></script>
+
+
+ <script src="/js/5d3c493-193ec0b.js"></script>
+
+ <script src="/js/c1dacd9-f146d62.js"></script>
+ <!--[if gt IE 8]><!-->
+ <script src="/assets/js/highlight/highlight.pack.js"></script>
+
+ <script>
+ $(function () {
+ // syntax highlighting for code blocks
+ $("pre > code").each(function() {
+ var node = $(this);
+
+ var language;
+
+ // JATS >=1.1
+ language = node.data('jats-language');
+
+ if (!language) {
+ // JATS <1.1
+ language = node.data('jats-preformat-type');
+
+ // ignore default 'code' type
+ if (language === 'code') {
+ language = null;
+ }
+ }
+
+ if (language) {
+ node.addClass('language-' + language);
+ }
+
+ hljs.highlightBlock(this);
+ });
+ });
+ </script>
+ <!--<![endif]-->
+
+ <script>
+ //initialise the follow button
+ $(function() {
+ PeerJ.Event.Follow.init();
+ });
+
+ //Show citations modal if query param exists
+ var urlParams = new URLSearchParams(window.location.search);
+ if(urlParams.has('citations')){
+ $('#citing-modal').modal('show');
+ }
+
+ </script>
+
+
+<script type="text/x-mathjax-config">
+ MathJax.Hub.Config({
+ messageStyle: "none",
+ imageFont: null,
+ "CommonHTML": {
+ linebreaks: { automatic: true },
+ scale: 95
+ },
+ "HTML-CSS": {
+ linebreaks: { automatic: true },
+ scale: 90
+ },
+ menuSettings: {
+ zoom: "Click"
+ }
+ });
+
+ MathJax.Ajax.config.root = "/bundles/peerjmathjax/MathJax/";
+</script>
+
+<script src="/bundles/peerjmathjax/MathJax/MathJax.js?config=TeX-MML-AM_HTMLorMML,Safe&noContrib"></script>
+
+ <script defer src='https://js.trendmd.com/trendmd.min.js' data-trendmdconfig='{"journal_id":"52926","element":"#related-research"}'></script>
+ <script defer src='https://js.trendmd.com/trendmd.min.js' data-trendmdconfig='{"journal_id":"52926","element":"#related-research-sidebar"}'></script>
+ <script async src="https://badge.dimensions.ai/badge.js" charset="utf-8"></script>
+
+ <div id="content-alert-container"></div>
+
+ <div id="toast-container"></div>
+
+ <div id="vue-notifications"></div>
+
+ <div id="vue-confirm-modal"></div>
+
+ <script>
+ $(PeerJ.Home.Banner.init);
+ </script>
+
+ </body>
+</html>
diff --git a/python/tests/files/plos_one_article.html b/python/tests/files/plos_one_article.html
new file mode 100644
index 0000000..9abfe00
--- /dev/null
+++ b/python/tests/files/plos_one_article.html
@@ -0,0 +1,1707 @@
+
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml"
+ xmlns:dc="http://purl.org/dc/terms/"
+ xmlns:doi="http://dx.doi.org/"
+ lang="en" xml:lang="en"
+ itemscope itemtype="http://schema.org/Article"
+ class="no-js">
+
+
+
+<head prefix="og: http://ogp.me/ns#">
+ <title>Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody</title>
+
+
+
+
+
+
+
+<link rel="stylesheet" href="/plosone/resource/compiled/asset_KWXDCDFJFQKCXTNZJE7SIB7MT43CSDVH.css" />
+
+ <!-- allows for extra head tags -->
+
+
+<!-- hello -->
+<link rel="stylesheet" type="text/css"
+ href="https://fonts.googleapis.com/css?family=Open+Sans:400,400i,600">
+
+<link media="print" rel="stylesheet" type="text/css" href="/plosone/resource/css/print.css"/>
+ <script type="text/javascript">
+ var siteUrlPrefix = "/plosone/";
+ </script>
+<script src="/plosone/resource/compiled/asset_SC5JIUGEUPR4P4P6VBUINUVOVUSU3NRY.js"></script>
+
+ <link rel="shortcut icon" href="/plosone/resource/img/favicon.ico" type="image/x-icon"/>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+
+
+
+
+
+
+ <link rel="canonical" href="https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978" />
+ <meta name="description" content="Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers." />
+
+ <meta name="citation_abstract" content="Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers.">
+
+
+ <meta name="keywords" content="Chickens,Antibodies,Livestock,Attenuated vaccines,Enzyme-linked immunoassays,Poultry,Animal sexual behavior,Vaccines" />
+
+
+<meta name="citation_doi" content="10.1371/journal.pone.0213978"/>
+<meta name="citation_author" content="Yang Li"/>
+ <meta name="citation_author_institution" content="China Animal Health and Epidemiology Center, Qingdao, China"/>
+<meta name="citation_author" content="Tuanjie Wang"/>
+ <meta name="citation_author_institution" content="China Institute of Veterinary Drug Control, Beijing, China"/>
+<meta name="citation_author" content="Lin Wang"/>
+ <meta name="citation_author_institution" content="China Animal Health and Epidemiology Center, Qingdao, China"/>
+<meta name="citation_author" content="Mingjun Sun"/>
+ <meta name="citation_author_institution" content="China Animal Health and Epidemiology Center, Qingdao, China"/>
+<meta name="citation_author" content="Zhizhong Cui"/>
+ <meta name="citation_author_institution" content="College of Veterinary Medicine, Shandong Agricultural University, Taian, China"/>
+<meta name="citation_author" content="Shuang Chang"/>
+ <meta name="citation_author_institution" content="College of Veterinary Medicine, Shandong Agricultural University, Taian, China"/>
+<meta name="citation_author" content="Yongping Wu"/>
+ <meta name="citation_author_institution" content="College of Animal Sciences and Technology, Zhejiang A&F University, Hangzhou, China"/>
+<meta name="citation_author" content="Xiaodong Zhang"/>
+ <meta name="citation_author_institution" content="College of Animal Sciences and Technology, Zhejiang A&F University, Hangzhou, China"/>
+<meta name="citation_author" content="Xiaohui Yu"/>
+ <meta name="citation_author_institution" content="China Animal Health and Epidemiology Center, Qingdao, China"/>
+<meta name="citation_author" content="Tao Sun"/>
+ <meta name="citation_author_institution" content="Shandong Entry-exit Inspection and Quarantine Bureau, Qingdao, China"/>
+<meta name="citation_author" content="Peng Zhao"/>
+ <meta name="citation_author_institution" content="College of Veterinary Medicine, Shandong Agricultural University, Taian, China"/>
+
+<meta name="citation_title" content="Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"/>
+<meta itemprop="name" content="Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"/>
+<meta name="citation_journal_title" content="PLOS ONE"/>
+<meta name="citation_journal_abbrev" content="PLOS ONE"/>
+<meta name="citation_date" content="Apr 22, 2019"/>
+<meta name="citation_firstpage" content="e0213978"/>
+<meta name="citation_issue" content="4"/>
+<meta name="citation_volume" content="14"/>
+<meta name="citation_issn" content="1932-6203"/>
+<meta name="citation_publisher" content="Public Library of Science"/>
+
+ <meta name="citation_pdf_url" content="https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable">
+
+ <meta name="citation_article_type" content="Research Article">
+
+<meta name="dc.identifier" content="10.1371/journal.pone.0213978" />
+
+
+ <meta name="twitter:card" content="summary" />
+ <meta name="twitter:site" content="@plosone"/>
+ <meta name="twitter:title" content="Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" />
+ <meta property="twitter:description" content="Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers." />
+ <meta property="twitter:image" content="https://journals.plos.org/plosone/article/figure/image?id=10.1371/journal.pone.0213978.t003&size=inline" />
+
+<meta property="og:type" content="article" />
+<meta property="og:url" content="https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978"/>
+<meta property="og:title" content="Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"/>
+<meta property="og:description" content="Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers."/>
+<meta property="og:image" content="https://journals.plos.org/plosone/article/figure/image?id=10.1371/journal.pone.0213978.t003&size=inline"/>
+
+<meta name="citation_reference" content="citation_title=Occurrence of reticuloendotheliosis in Chinese partridge;citation_author=Z. Cheng;citation_author=Y. Shi;citation_author=L. Zhan;citation_author=G. Zhu;citation_author=X Diao;citation_author=Z. Cui;citation_journal_title=J Vet Med Sci;citation_volume=69;citation_number=69;citation_issue=12;citation_first_page=1295;citation_last_page=1298;citation_publication_date=2007;"/>
+<meta name="citation_reference" content="citation_title=Simultaneous endemic infections with subgroup J avian leukosis virus and reticuloendotheliosis virus in commercial and local breeds of chickens;citation_author=Z. Cui;citation_author=S. Sun;citation_author=Z. Zhang;citation_author=S. Meng;citation_journal_title=Avian Pathol;citation_volume=38;citation_number=38;citation_issue=6;citation_first_page=443;citation_last_page=448;citation_publication_date=2009;"/>
+<meta name="citation_reference" content="citation_title=Serological survey of the Reticuloendotheliosis virus infection in China native chicken flocks;citation_author=P. Zhao;citation_author=C. Ma;citation_author=Y. Du;citation_author=Z. Cui;citation_journal_title=Pak Vet J;citation_volume=32;citation_number=32;citation_first_page=621;citation_last_page=623;citation_publication_date=2012;"/>
+<meta name="citation_reference" content="citation_title=Vertical transmission of reticuloendotheliosis virus in breeder turkeys;citation_author=R.L. Witter;citation_author=D.W. Salter;citation_journal_title=Avian Dis;citation_volume=33;citation_number=33;citation_first_page=226;citation_last_page=235;citation_publication_date=1989;"/>
+<meta name="citation_reference" content="citation_title=An outbreak of lymphomas in commercial broiler breeder chickens vaccinated with a fowlpox vaccine contaminated with reticuloendotheliosis virus;citation_author=A.M. Fadly;citation_author=R.L. Witter;citation_author=E.J. Smith;citation_author=R.F. Silva;citation_author=W.M. Reed;citation_author=F.J Hoerr;citation_author=M.R. Putnam;citation_journal_title=Avian Pathol;citation_volume=25;citation_number=25;citation_issue=1;citation_first_page=35;citation_last_page=47;citation_publication_date=1996;"/>
+<meta name="citation_reference" content="citation_title=Detection of reticuloendotheliosis virus in live virus vaccines of poultry;citation_author=A Fadly;citation_author=M.C. Garcia;citation_journal_title=Dev. Biol;citation_volume=126;citation_number=126;citation_first_page=301;citation_last_page=305;citation_publication_date=2005;"/>
+<meta name="citation_reference" content="citation_title=Detection of reticuloendotheliosis virus as a contaminant of fowl pox vaccines;citation_author=A.M. Awad;citation_author=H.S. Abd El-Hamid;citation_author=A.A. Abou Rawash;citation_author=H.H. Ibrahim;citation_journal_title=Poult. Sci;citation_volume=89;citation_number=89;citation_issue=11;citation_first_page=2389;citation_last_page=2395;citation_publication_date=2010;"/>
+<meta name="citation_reference" content="citation_title=Isolation, identification, and whole genome sequencing of reticuloendotheliosis virus from a vaccine against Marek’s disease;citation_author=J.P. Li;citation_author=X. Dong;citation_author=C. Yang;citation_author=Q.H. Li;citation_author=Z. Cui;citation_author=S. Chang;citation_author=P. Zhao;citation_author=K.Z Yu;citation_author=C. Yang;citation_journal_title=Poult. Sci;citation_volume=94;citation_number=94;citation_issue=4;citation_first_page=643;citation_last_page=649;citation_publication_date=2015;"/>
+<meta name="citation_reference" content="citation_title=Isolation of Reticuloendotheliosis Virus from a fowlpox live vaccine and env gene sequence analysis;citation_author=J Wang;citation_author=Z Li;citation_author=P Zhao;citation_author=H Chen;citation_author=Z Cui;citation_journal_title=Chinese Journal of Animal Infectious Diseases;citation_volume=18;citation_number=18;citation_first_page=35;citation_last_page=39;citation_publication_date=2010;"/>
+<meta name="citation_reference" content="citation_title=Probable congenital transmission of reticuloendotheliosis virus caused by vaccination with contaminated vaccines;citation_author=K. Wei;citation_author=Z. Sun;citation_author=S. Zhu;citation_author=W. Guo;citation_author=P. Sheng;citation_author=P. Wang;citation_author=C. Zhao;citation_author=Q. Zhao;citation_author=R. Zhu;citation_journal_title=PLoS One;citation_volume=7;citation_number=7;citation_first_page=e43422;citation_publication_date=2012;"/>
+<meta name="citation_reference" content="citation_title=Isolation of a reticuloendotheliosis virus from chickens inoculated with Marek’s disease vaccine;citation_author=N. Yuasa;citation_author=I. Yoshida;citation_author=T. Taniguchi;citation_journal_title=Natl. Inst. Anim. Health Q;citation_volume=16;citation_number=16;citation_issue=4;citation_first_page=141;citation_last_page=151;citation_publication_date=1976;"/>
+<meta name="citation_reference" content="citation_title=Infection studies on a reticuloendotheliosis virus contaminant of a commercial Marek’s disease vaccine;citation_author=T. J. Bagust;citation_author=T. M. Grimes;citation_author=D. P. Dennett;citation_journal_title=Aust Vet J;citation_volume=55;citation_number=55;citation_issue=4;citation_first_page=153;citation_last_page=157;citation_publication_date=1979;"/>
+<meta name="citation_reference" content="citation_title=Field isolates of fowlpox virus contaminated with reticuloendotheliosis virus;citation_author=I. S. Diallo;citation_author=M. A. Mackenzie;citation_author=P. B. Spradbrow;citation_journal_title=Avian pathol;citation_volume=27;citation_number=27;citation_issue=1;citation_first_page=60;citation_last_page=66;citation_publication_date=1998;"/>
+<meta name="citation_reference" content="citation_title=Field and vaccine strains of fowlpox virus carry integrated sequences from the avian retrovirus, reticuloendotheliosis virus;citation_author=C Hertig;citation_author=B. E. Coupar;citation_author=A. R. Gould;citation_author=D.B. Boyle;citation_journal_title=Virology;citation_volume=235;citation_number=235;citation_issue=2;citation_first_page=367;citation_last_page=376;citation_publication_date=1997;"/>
+<meta name="citation_reference" content="citation_title=Reticuloendotheliosis virus (REV) long terminal repeats incorporated in the genomes of commercial fowl poxvirus vaccines and pigeon poxviruses without indication of the presence of infectious REV;citation_author=K. M. Moore;citation_author=J. R Davis;citation_author=T Sato;citation_journal_title=Avian Dis;citation_volume=44;citation_number=44;citation_issue=4;citation_first_page=827;citation_last_page=841;citation_publication_date=2000;"/>
+<meta name="citation_reference" content="citation_title=In vivo events of retroiral long terminal repeat integration into Marek’s disease virus in commerial poultry: detection of chimeric molecules as a marker;citation_author=I Davidson;citation_author=R. Borenshtain;citation_journal_title=Avian Disease;citation_volume=45;citation_number=45;citation_issue=1;citation_first_page=102;citation_last_page=121;citation_publication_date=2001;"/>
+<meta name="citation_reference" content="citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;"/>
+<meta name="citation_reference" content="citation_title=A BAC clone of MDV strain GX0101 with REV-LTR integration retained its pathogenicity;citation_author=A J. Sun;citation_author=L. P. Petherbridge;citation_author=Y. G. Zhao;citation_author=Y. P. Li;citation_author=K. Nair. Venugopal;citation_author=Z.Z Cui;citation_journal_title=Chinese Science Bulletin;citation_volume=54;citation_number=54;citation_issue=15;citation_first_page=2641;citation_last_page=2647;citation_publication_date=2009;"/>
+<meta name="citation_reference" content="citation_title=Detection of fowl poxvirus integrated with reticuloendotheliosis virus sequences from an outbreak in backyard chickens in India;citation_author=S. K. Biswas;citation_author=C. Jana;citation_author=K. Chand;citation_author=W. Rehman;citation_author=B. Mondal;citation_journal_title=Vet Ital;citation_volume=47;citation_number=47;citation_issue=2;citation_first_page=147;citation_last_page=513;citation_publication_date=2010;"/>
+<meta name="citation_reference" content="citation_title=Functional evaluation of the role of reticuloendotheliosis virus long terminal repeat (LTR) integrated into the genome of a field strain of Marek’s disease virus;citation_author=A. J. Sun;citation_author=X. Y. Xu;citation_author=L. Petherbridge;citation_author=Y. G. Zhao;citation_author=V. Nair;citation_author=Z. Z Cui;citation_journal_title=Virology;citation_volume=397;citation_number=397;citation_issue=2;citation_first_page=270;citation_last_page=276;citation_publication_date=2010;"/>
+<meta name="citation_reference" content="citation_title=Protective efficacy of vaccination against highly pathogenic avian influenza is dramatically suppressed by early infection of chickens with reticuloendotheliosis virus;citation_author=S.H. Sun;citation_author=Z.Z. Cui;citation_author=J Wang;citation_author=Z. L Wang;citation_journal_title=Avian Pathol;citation_volume=38;citation_number=38;citation_first_page=31;citation_last_page=34;citation_publication_date=2009;"/>
+<meta name="citation_reference" content="citation_title=Depression of vaccinal immunity to Marek’s disease by infection with reticuloendotheliosis virus;citation_author=R. L. Witter;citation_author=L. F. Lee;citation_author=L. D. Bacon;citation_author=E. J. Smith;citation_journal_title=Infection and Immunity;citation_volume=26;citation_number=26;citation_first_page=90;citation_last_page=98;citation_publication_date=1979;"/>
+<meta name="citation_reference" content="citation_title=Sequencing and analysis of whole genome nucleotide sequence of Chinese REV isolate HA9901;citation_author=Y. Wang;citation_author=Z. Cui;citation_author=S. Jiang;citation_journal_title=Science in China Serices C: Life Sciences;citation_volume=35;citation_number=35;citation_first_page=340;citation_last_page=380;citation_publication_date=2005;"/>
+
+
+<!-- DoubleClick overall ad setup script -->
+<script type='text/javascript'>
+ var googletag = googletag || {};
+ googletag.cmd = googletag.cmd || [];
+ (function() {
+ var gads = document.createElement('script');
+ gads.async = true;
+ gads.type = 'text/javascript';
+ var useSSL = 'https:' == document.location.protocol;
+ gads.src = (useSSL ? 'https:' : 'http:') +
+ '//www.googletagservices.com/tag/js/gpt.js';
+ var node = document.getElementsByTagName('script')[0];
+ node.parentNode.insertBefore(gads, node);
+ })();
+</script>
+
+<!-- DoubleClick ad slot setup script -->
+
+ <script id="doubleClickSetupScript" type='text/javascript'>
+ googletag.cmd.push(function() {
+ googletag.defineSlot('/75507958/PONE_728x90_ATF', [728, 90], 'div-gpt-ad-1458247671871-0').addService(googletag.pubads());
+ googletag.defineSlot('/75507958/PONE_160x600_BTF', [160, 600], 'div-gpt-ad-1458247671871-1').addService(googletag.pubads());
+ googletag.pubads().enableSingleRequest();
+ googletag.enableServices();
+ });
+ </script>
+
+
+
+<script type="text/javascript">
+ var WombatConfig = WombatConfig || {};
+ WombatConfig.resourcePath = "/plosone/resource/";
+ WombatConfig.imgPath = "/plosone/resource/img/";
+ WombatConfig.journalKey = "PLoSONE";
+ WombatConfig.figurePath = "/plosone/article/figure/image";
+ WombatConfig.figShareInstitutionString = "plos";
+ WombatConfig.doiResolverPrefix = "https://dx.plos.org/";
+</script>
+
+<script type="text/javascript">
+ var WombatConfig = WombatConfig || {};
+ WombatConfig.metrics = WombatConfig.metrics || {};
+ WombatConfig.metrics.referenceUrl = "http://lagotto.io/plos";
+ WombatConfig.metrics.googleScholarUrl = "https://scholar.google.com/scholar";
+ WombatConfig.metrics.googleScholarCitationUrl = WombatConfig.metrics.googleScholarUrl + "?hl=en&lr=&q=";
+ WombatConfig.metrics.crossrefUrl = "https://www.crossref.org";
+</script>
+<script src="https://code.jquery.com/jquery-2.1.4.min.js" ></script>
+<script>window.jQuery || document.write('<script src="/plosone/resource/js/vendor/jquery-2.1.4.min.js""><\/script>')</script>
+
+ <script type="text/javascript" src="https://widgets.figshare.com/static/figshare.js"></script>
+
+
+
+
+
+
+
+
+
+
+
+</head>
+
+
+
+<body class="article plosone">
+
+
+
+
+
+<header>
+
+ <div id="topslot" class="head-top">
+
+<div class="center">
+<div class="title">Advertisement</div>
+<!-- DoubleClick Ad Zone -->
+ <div class='advertisement' id='div-gpt-ad-1458247671871-0' style='width:728px; height:90px;'>
+ <script type='text/javascript'>
+ googletag.cmd.push(function() { googletag.display('div-gpt-ad-1458247671871-0'); });
+ </script>
+ </div>
+</div>
+ </div>
+
+ <div id="user" class="nav">
+ <ul class="nav-user">
+
+
+
+
+ <li ><a href="https://www.plos.org">plos.org</a></li>
+
+
+ <li ><a href="https://community.plos.org/registration/new">create account</a></li>
+
+
+ <li class="highlighted"><a href="/plosone/user/secure/login?page=%2Fplosone%2Farticle%3Fid%3D10.1371%2Fjournal.pone.0213978">sign in</a></li>
+
+ </ul>
+ </div>
+ <div id="pagehdr">
+
+ <nav class="nav-main">
+
+
+
+
+<h1 class="logo">
+ <a href="/plosone/.">PLOS ONE</a>
+</h1>
+
+<section class="top-bar-section">
+
+<ul class="nav-elements">
+
+
+ <li class="multi-col-parent menu-section-header has-dropdown" id="publish">
+ Publish
+ <div class="dropdown mega ">
+ <ul class="multi-col" id="publish-dropdown-list">
+
+ <li class="menu-section-header " id="submissions">
+ <span class="menu-section-header-title"> Submissions </span>
+
+ <ul class="menu-section "
+ id="submissions-dropdown-list">
+ <li>
+ <a href="/plosone/s/getting-started" >Getting Started</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/submission-guidelines" >Submission Guidelines</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/figures" >Figures</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/tables" >Tables</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/supporting-information" >Supporting Information</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/latex" >LaTeX</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/preprints" >Preprints</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/revising-your-manuscript" >Revising Your Manuscript</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/submit-now" >Submit Now</a>
+ </li>
+
+ <li>
+ <a href="https://collections.plos.org/s/calls-for-papers" >Calls for Papers</a>
+ </li>
+
+ </ul>
+
+ </li>
+
+
+ <li class="menu-section-header " id="policies">
+ <span class="menu-section-header-title"> Policies </span>
+
+ <ul class="menu-section "
+ id="policies-dropdown-list">
+ <li>
+ <a href="/plosone/s/best-practices-in-research-reporting" >Best Practices in Research Reporting</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/human-subjects-research" >Human Subjects Research</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/animal-research" >Animal Research</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/competing-interests" >Competing Interests</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/disclosure-of-funding-sources" >Disclosure of Funding Sources</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/licenses-and-copyright" >Licenses and Copyright</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/data-availability" >Data Availability</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/materials-and-software-sharing" >Materials and Software Sharing</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/ethical-publishing-practice" >Ethical Publishing Practice</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/authorship" >Authorship</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/downloads-and-translations" >Downloads and Translations</a>
+ </li>
+
+ </ul>
+
+ </li>
+
+
+ <li class="menu-section-header " id="manuscript-review-and-publication">
+ <span class="menu-section-header-title"> Manuscript Review and Publication </span>
+
+ <ul class="menu-section "
+ id="manuscript-review-and-publication-dropdown-list">
+ <li>
+ <a href="/plosone/s/criteria-for-publication" >Criteria for Publication</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/editorial-and-peer-review-process" >Editorial and Peer Review Process</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/editor-center" >Editor Center</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/reviewer-guidelines" >Guidelines for Reviewers</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/accepted-manuscripts" >Accepted Manuscripts</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/corrections-and-retractions" >Corrections and Retractions</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/comments" >Comments</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/article-level-metrics" >Article-Level Metrics</a>
+ </li>
+
+ </ul>
+
+ </li>
+ </ul>
+ <div class="calloutcontainer">
+
+
+
+ <h3 class="callout-headline">Submit Your Manuscript</h3>
+
+ <div class="action-contain">
+ <p class="callout-content">
+ Discover a faster, simpler path to publishing in a high-quality journal. <em>PLOS ONE</em> promises fair, rigorous peer review,
+ broad scope, and wide readership – a perfect fit for your research every time.
+ </p>
+
+ <p class="button-contain special">
+ <a class="button button-default" href="/plosone/static/publish">
+ Learn More
+ </a>
+ <a class="button-link" href="https://www.editorialmanager.com/pone/default.asp">
+ Submit Now
+ </a>
+ </p>
+ </div> <!-- opens in siteMenuCalloutDescription -->
+
+
+ </div>
+ </div>
+ </li>
+
+
+
+ <li class="menu-section-header has-dropdown " id="about">
+ <span class="menu-section-header-title"> About </span>
+
+ <ul class="menu-section dropdown "
+ id="about-dropdown-list">
+ <li>
+ <a href="/plosone/static/publish" >Why Publish with PLOS ONE</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/journal-information" >Journal Information</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/staff-editors" >Staff Editors</a>
+ </li>
+
+ <li>
+ <a href="/plosone/static/editorial-board" >Editorial Board</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/section-editors" >Section Editors</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/advisory-groups" >Advisory Groups</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/find-and-read-articles" >Find and Read Articles</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/publishing-information" >Publishing Information</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/publication-fees" >Publication Fees</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/press-and-media" >Press and Media</a>
+ </li>
+
+ <li>
+ <a href="/plosone/s/contact" >Contact</a>
+ </li>
+
+ </ul>
+
+ </li>
+
+ <li data-js-tooltip-hover="trigger" class="subject-area menu-section-header">
+ Browse
+ </li>
+
+ <li id="navsearch" class="head-search">
+
+
+ <form name="searchForm" action="/plosone/search" method="get">
+ <fieldset>
+ <legend>Search</legend>
+ <label for="search">Search</label>
+ <div class="search-contain">
+ <input id="search" type="text" name="q" placeholder="SEARCH" required/>
+ <button id="headerSearchButton" type="submit"><span class="search-icon"></span></button>
+ </div>
+ </fieldset>
+ <input type="hidden" name="filterJournals" value="PLoSONE"/>
+ </form>
+
+ <a id="advSearch"
+ href="/plosone/search">
+ advanced search
+ </a>
+
+
+
+
+ </li>
+
+ </ul>
+ </section>
+ </nav>
+ </div>
+
+</header><section id="taxonomyContainer">
+
+<div id="taxonomy-browser" class="areas" data-search-url="/plosone/browse">
+ <div class="wrapper">
+ <div class="taxonomy-header">
+ Browse Subject Areas
+ <div id="subjInfo">?</div>
+ <div id="subjInfoText">
+ <p>Click through the PLOS taxonomy to find articles in your field.</p>
+ <p>For more information about PLOS Subject Areas, click
+ <a href="https://github.com/PLOS/plos-thesaurus/blob/develop/README.md" target="_blank" title="Link opens in new window">here</a>.
+ </p>
+ </div>
+ </div>
+ <div class="levels">
+ <div class="levels-container cf">
+ <div class="levels-position"></div>
+ </div>
+ <a href="#" class="prev"></a>
+ <a href="#" class="next active"></a>
+ </div>
+ </div>
+ <div class="taxonomy-browser-border-bottom"></div>
+</div></section>
+<main> <div class="set-grid">
+
+<header class="title-block">
+
+
+
+<script type="text/javascript">
+ var COUNTER_HOST = "https://counter.plos.org/api/v1.0/stats/totals/doi";
+</script>
+
+<script type="text/javascript">
+ var ALM_CONFIG = ALM_CONFIG || {};
+ ALM_CONFIG.hostname = "https://alm.plos.org";
+ ALM_CONFIG.apiKey = "3pezRBRXdyzYW6ztfwft";
+ ALM_CONFIG.host = "https://alm.plos.org/api/v5/articles";
+</script>
+
+<ul id="almSignposts" class="signposts">
+ <li id="loadingMetrics">
+ <p>Loading metrics</p>
+ </li>
+</ul>
+
+<script type="text/template" id="signpostsGeneralErrorTemplate">
+ <li id="metricsError">Article metrics are unavailable at this time. Please try again later.</li>
+</script>
+
+<script type="text/template" id="signpostsNewArticleErrorTemplate">
+ <li></li><li></li><li id="tooSoon">Article metrics are unavailable for recently published articles.</li>
+</script>
+
+<script type="text/template" id="signpostsTemplate">
+ <li id="almSaves">
+ <%= s.numberFormat(saveCount, 0) %>
+ <div class="tools" data-js-tooltip-hover="trigger">
+ <a class="metric-term" href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#savedHeader">Save</a>
+ <p class="saves-tip" data-js-tooltip-hover="target"><a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#savedHeader">Total Mendeley bookmarks.</a></p>
+ </div>
+ </li>
+
+ <li id="almCitations">
+ <%= s.numberFormat(citationCount, 0) %>
+ <div class="tools" data-js-tooltip-hover="trigger">
+ <a class="metric-term" href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#citedHeader">Citation</a>
+ <p class="citations-tip" data-js-tooltip-hover="target"><a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#citedHeader">Paper's citation count computed by Scopus.</a></p>
+ </div>
+ </li>
+
+ <li id="almViews">
+ <%= s.numberFormat(viewCount, 0) %>
+ <div class="tools" data-js-tooltip-hover="trigger">
+ <a class="metric-term" href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#viewedHeader">View</a>
+ <p class="views-tip" data-js-tooltip-hover="target"><a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#viewedHeader">Sum of PLOS and PubMed Central page views and downloads.</a></p>
+ </div>
+ </li>
+
+ <li id="almShares">
+ <%= s.numberFormat(shareCount, 0) %>
+ <div class="tools" data-js-tooltip-hover="trigger">
+ <a class="metric-term" href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#discussedHeader">Share</a>
+ <p class="shares-tip" data-js-tooltip-hover="target"><a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978#discussedHeader">Sum of Facebook and Twitter activity.</a></p>
+ </div>
+ </li>
+</script>
+
+ <div class="article-meta">
+
+<div class="classifications">
+ <p class="license-short" id="licenseShort">Open Access</p>
+ <p class="peer-reviewed" id="peerReviewed">Peer-reviewed</p>
+
+<div class="article-type" >
+ <p class="type-article" id="artType">Research Article</p>
+</div>
+
+
+</div>
+
+
+ </div>
+ <div class="article-title-etc">
+
+
+
+<div class="title-authors">
+ <h1 id="artTitle"><?xml version="1.0" encoding="UTF-8"?>Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody</h1>
+
+<ul class="author-list clearfix" data-js-tooltip="tooltip_container" id="author-list">
+
+
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="0" class="author-name" >
+Yang Li <span class="contribute"> </span>,</a> <div id="author-meta-0" class="author-info" data-js-tooltip="tooltip_target">
+
+ <p>
+ <span class="contribute"> </span> Contributed equally to this work with:
+ Yang Li,
+ Tuanjie Wang
+ </p>
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology,
+
+ Project administration,
+
+ Resources,
+
+ Writing – original draft
+ </p>
+
+ <p id="authAffiliations-0"><span class="type">Affiliation</span>
+ China Animal Health and Epidemiology Center, Qingdao, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose0"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="1" class="author-name" >
+Tuanjie Wang <span class="contribute"> </span>,</a> <div id="author-meta-1" class="author-info" data-js-tooltip="tooltip_target">
+
+ <p>
+ <span class="contribute"> </span> Contributed equally to this work with:
+ Yang Li,
+ Tuanjie Wang
+ </p>
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Project administration
+ </p>
+
+ <p id="authAffiliations-1"><span class="type">Affiliation</span>
+ China Institute of Veterinary Drug Control, Beijing, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose1"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="2" class="author-name" >
+Lin Wang,</a> <div id="author-meta-2" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology
+ </p>
+
+ <p id="authAffiliations-2"><span class="type">Affiliation</span>
+ China Animal Health and Epidemiology Center, Qingdao, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose2"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="3" class="author-name" >
+Mingjun Sun,</a> <div id="author-meta-3" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Resources
+ </p>
+
+ <p id="authAffiliations-3"><span class="type">Affiliation</span>
+ China Animal Health and Epidemiology Center, Qingdao, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose3"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="4" class="author-name" >
+Zhizhong Cui,</a> <div id="author-meta-4" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Supervision
+ </p>
+
+ <p id="authAffiliations-4"><span class="type">Affiliation</span>
+ College of Veterinary Medicine, Shandong Agricultural University, Taian, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose4"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="5" class="author-name" >
+Shuang Chang,</a> <div id="author-meta-5" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology
+ </p>
+
+ <p id="authAffiliations-5"><span class="type">Affiliation</span>
+ College of Veterinary Medicine, Shandong Agricultural University, Taian, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose5"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="6" class="author-name" >
+Yongping Wu,</a> <div id="author-meta-6" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology
+ </p>
+
+ <p id="authAffiliations-6"><span class="type">Affiliation</span>
+ College of Animal Sciences and Technology, Zhejiang A&F University, Hangzhou, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose6"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="7" class="author-name" >
+Xiaodong Zhang,</a> <div id="author-meta-7" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Methodology
+ </p>
+
+ <p id="authAffiliations-7"><span class="type">Affiliation</span>
+ College of Animal Sciences and Technology, Zhejiang A&F University, Hangzhou, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose7"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="8" class="author-name" >
+Xiaohui Yu <span class="email"> </span>,</a> <div id="author-meta-8" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Data curation,
+
+ Investigation,
+
+ Writing – review & editing
+ </p>
+ <p id="authCorresponding-8"> <span class="email">* E-mail:</span> <a href="mailto:suntaosdciq@163.com">suntaosdciq@163.com</a> (TS); <a href="mailto:619334017@qq.com">619334017@qq.com</a> (PZ); <a href="mailto:yxhui1030@126.com">yxhui1030@126.com</a> (XY)</p>
+ <p id="authAffiliations-8"><span class="type">Affiliation</span>
+ China Animal Health and Epidemiology Center, Qingdao, China
+ </p>
+ <div>
+ <p class="orcid" id="authOrcid-8">
+ <span>
+ <a id="connect-orcid-link" href="http://orcid.org/0000-0003-0555-8727" target="_blank" title="ORCID Registry">
+ <img id="orcid-id-logo" src="/plosone/resource/img/orcid_16x16.png" width="16" height="16" alt="ORCID logo"/>
+ </a>
+ </span>
+ <a href="http://orcid.org/0000-0003-0555-8727">http://orcid.org/0000-0003-0555-8727</a>
+ </p>
+ </div>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose8"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="9" class="author-name" >
+Tao Sun <span class="email"> </span>,</a> <div id="author-meta-9" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Data curation
+ </p>
+ <p id="authCorresponding-9"> <span class="email">* E-mail:</span> <a href="mailto:suntaosdciq@163.com">suntaosdciq@163.com</a> (TS); <a href="mailto:619334017@qq.com">619334017@qq.com</a> (PZ); <a href="mailto:yxhui1030@126.com">yxhui1030@126.com</a> (XY)</p>
+ <p id="authAffiliations-9"><span class="type">Affiliation</span>
+ Shandong Entry-exit Inspection and Quarantine Bureau, Qingdao, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose9"> &#x02A2F; </a>
+ </div>
+</li>
+
+<li
+ data-js-tooltip="tooltip_trigger"
+
+>
+ <a data-author-id="10" class="author-name" >
+Peng Zhao <span class="email"> </span></a> <div id="author-meta-10" class="author-info" data-js-tooltip="tooltip_target">
+
+
+ <p class="roles" id="authRoles">
+ <span class="type">Roles</span>
+ Supervision,
+
+ Validation
+ </p>
+ <p id="authCorresponding-10"> <span class="email">* E-mail:</span> <a href="mailto:suntaosdciq@163.com">suntaosdciq@163.com</a> (TS); <a href="mailto:619334017@qq.com">619334017@qq.com</a> (PZ); <a href="mailto:yxhui1030@126.com">yxhui1030@126.com</a> (XY)</p>
+ <p id="authAffiliations-10"><span class="type">Affiliation</span>
+ College of Veterinary Medicine, Shandong Agricultural University, Taian, China
+ </p>
+
+ <a data-js-tooltip="tooltip_close" class="close" id="tooltipClose10"> &#x02A2F; </a>
+ </div>
+</li>
+
+</ul>
+
+</div>
+
+
+<div id="floatTitleTop" data-js-floater="title_author" class="float-title">
+ <div class="set-grid">
+ <div class="float-title-inner">
+ <h1><?xml version="1.0" encoding="UTF-8"?>Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody</h1>
+
+<ul id="floatAuthorList" data-js-floater="floated_authors">
+
+ <li data-float-index="1">Yang Li,&nbsp;
+
+ </li>
+ <li data-float-index="2">Tuanjie Wang,&nbsp;
+
+ </li>
+ <li data-float-index="3">Lin Wang,&nbsp;
+
+ </li>
+ <li data-float-index="4">Mingjun Sun,&nbsp;
+
+ </li>
+ <li data-float-index="5">Zhizhong Cui,&nbsp;
+
+ </li>
+ <li data-float-index="6">Shuang Chang,&nbsp;
+
+ </li>
+ <li data-float-index="7">Yongping Wu,&nbsp;
+
+ </li>
+ <li data-float-index="8">Xiaodong Zhang,&nbsp;
+
+ </li>
+ <li data-float-index="9">Xiaohui Yu,&nbsp;
+
+ </li>
+ <li data-float-index="10">Tao Sun
+
+</ul>
+
+
+
+ </div>
+ <div class="logo-close" id="titleTopCloser">
+ <img src="/plosone/resource/img/logo.plos.95.png" alt="PLOS" />
+ <div class="close-floater" title="close">x</div>
+ </div>
+ </div>
+</div>
+
+ <ul class="date-doi">
+ <li id="artPubDate">Published: April 22, 2019</li>
+ <li id="artDoi">
+<a href="https://doi.org/10.1371/journal.pone.0213978">https://doi.org/10.1371/journal.pone.0213978</a>
+ </li>
+ </ul>
+
+ </div>
+ <div>
+
+ </div>
+</header>
+
+ <section class="article-body">
+
+
+
+<ul class="article-tabs">
+
+ <li class="tab-title active" id="tabArticle">
+ <a href="/plosone/article?id=10.1371/journal.pone.0213978" class="article-tab-1">Article</a>
+ </li>
+
+
+ <li class="tab-title " id="tabAuthors">
+ <a href="/plosone/article/authors?id=10.1371/journal.pone.0213978" class="article-tab-2">Authors</a>
+ </li>
+
+
+ <li class="tab-title " id="tabMetrics">
+ <a href="/plosone/article/metrics?id=10.1371/journal.pone.0213978" class="article-tab-3">Metrics</a>
+ </li>
+
+
+ <li class="tab-title " id="tabComments">
+ <a href="/plosone/article/comments?id=10.1371/journal.pone.0213978" class="article-tab-4">Comments</a>
+ </li>
+
+ <li class="tab-title " id="tabRelated">
+ <a href="/plosone/article/related?id=10.1371/journal.pone.0213978" class="article-tab-5">Media Coverage</a>
+ </li>
+
+</ul>
+
+ <div class="article-container">
+
+
+<div id="nav-article">
+ <ul class="nav-secondary">
+
+ <li class="nav-comments" id="nav-comments">
+ <a href="article/comments?id=10.1371/journal.pone.0213978">Reader Comments (0)</a>
+ </li>
+
+ <li class="nav-media" id="nav-media" data-doi="10.1371/journal.pone.0213978">
+ <a href="/plosone/article/related?id=10.1371/journal.pone.0213978">
+ Media Coverage <span id="media-coverage-count"></span>
+ </a>
+ </li>
+
+ <li id="nav-figures"><a href="#" data-doi="10.1371/journal.pone.0213978">Figures</a></li>
+ </ul>
+</div>
+
+<div id="figure-lightbox-container"></div>
+
+<script id="figure-lightbox-template" type="text/template">
+ <div id="figure-lightbox" class="reveal-modal full" data-reveal aria-hidden="true"
+ role="dialog">
+ <div class="lb-header">
+ <h1 id="lb-title"><%= articleTitle %></h1>
+
+ <div id="lb-authors">
+ <span>Yang Li</span>
+ <span>Tuanjie Wang</span>
+ <a class="more-authors" href="/plosone/article/authors?id=10.1371/journal.pone.0213978">...</a>
+ <span>Peng Zhao</span>
+ </div>
+
+ <div class="lb-close" title="close">&nbsp;</div>
+ </div>
+ <div class="img-container">
+ <div class="loader"> <i class="fa-spinner"></i> </div>
+ <img class="main-lightbox-image" src=""/>
+ <aside id="figures-list">
+ <% figureList.each(function (ix, figure) { %>
+ <div class="change-img" data-doi="<%= figure.getAttribute('data-doi') %>">
+ <img class="aside-figure" src="/plosone/article/figure/image?size=inline&id=<%= figure.getAttribute('data-doi') %>" />
+ </div>
+ <% }) %>
+ <div class="dummy-figure">
+ </div>
+ </aside>
+ </div>
+ <div id="lightbox-footer">
+
+ <div id="btns-container" class="lightbox-row <% if(figureList.length <= 1) { print('one-figure-only') } %>">
+ <div class="fig-btns-container reset-zoom-wrapper left">
+ <span class="fig-btn reset-zoom-btn">Reset zoom</span>
+ </div>
+ <div class="zoom-slider-container">
+ <div class="range-slider-container">
+ <span id="lb-zoom-min"></span>
+ <div class="range-slider round" data-slider data-options="start: 20; end: 200; initial: 20;">
+ <span class="range-slider-handle" role="slider" tabindex="0"></span>
+ <span class="range-slider-active-segment"></span>
+ <input type="hidden">
+ </div>
+ <span id="lb-zoom-max"></span>
+ </div>
+ </div>
+ <% if(figureList.length > 1) { %>
+ <div class="fig-btns-container">
+ <span class="fig-btn all-fig-btn"><i class="icon icon-all"></i> All Figures</span>
+ <span class="fig-btn next-fig-btn"><i class="icon icon-next"></i> Next</span>
+ <span class="fig-btn prev-fig-btn"><i class="icon icon-prev"></i> Previous</span>
+ </div>
+ <% } %>
+ </div>
+ <div id="image-context">
+ </div>
+ </div>
+ </div>
+</script>
+
+<script id="image-context-template" type="text/template">
+ <div class="footer-text">
+ <div id="figure-description-wrapper">
+ <div id="view-more-wrapper" style="<% descriptionExpanded? print('display:none;') : '' %>">
+ <span id="figure-title"><%= title %></span>
+ <p id="figure-description">
+ <%= description %>&nbsp;&nbsp;
+ </p>
+ <span id="view-more">show more<i class="icon-arrow-right"></i></span>
+
+ </div>
+ <div id="view-less-wrapper" style="<% descriptionExpanded? print('display:inline-block;') : '' %>" >
+ <span id="figure-title"><%= title %></span>
+ <p id="full-figure-description">
+ <%= description %>&nbsp;&nbsp;
+ <span id="view-less">show less<i class="icon-arrow-left"></i></span>
+ </p>
+ </div>
+ </div>
+ </div>
+ <div id="show-context-container">
+ <a class="btn show-context" href="<%= showInContext(strippedDoi) %>">Show in Context</a>
+ </div>
+ <div id="download-buttons">
+ <h3>Download:</h3>
+ <div class="item">
+ <a href="/plosone/article/figure/image?size=original&download=&id=<%= doi %>" title="original image">
+ <span class="download-btn">TIFF</span>
+ </a>
+ <span class="file-size"><%= fileSizes.original %></span>
+ </div>
+ <div class="item">
+ <a href="/plosone/article/figure/image?size=large&download=&id=<%= doi %>" title="large image">
+ <span class="download-btn">PNG</span>
+ </a>
+ <span class="file-size"><%= fileSizes.large %></span>
+ </div>
+ <div class="item">
+ <a href="/plosone/article/figure/powerpoint?id=<%= doi %>" title="PowerPoint slide">
+ <span class="download-btn">PPT</span>
+ </a>
+ </div>
+
+ </div>
+</script>
+ <div class="article-content">
+
+
+
+
+
+
+<div id="figure-carousel-section">
+ <h2>Figures</h2>
+
+ <div id="figure-carousel">
+
+ <div class="carousel-wrapper">
+ <div class="slider">
+
+ <div class="carousel-item lightbox-figure" data-doi="10.1371/journal.pone.0213978.t001">
+
+ <img src="/plosone/article/figure/image?size=inline&amp;id=10.1371/journal.pone.0213978.t001"
+ alt="Table 1"
+ />
+
+ </div>
+
+ <div class="carousel-item lightbox-figure" data-doi="10.1371/journal.pone.0213978.t002">
+
+ <img src="/plosone/article/figure/image?size=inline&amp;id=10.1371/journal.pone.0213978.t002"
+ alt="Table 2"
+ />
+
+ </div>
+
+ <div class="carousel-item lightbox-figure" data-doi="10.1371/journal.pone.0213978.t003">
+
+ <img src="/plosone/article/figure/image?size=inline&amp;id=10.1371/journal.pone.0213978.t003"
+ alt="Table 3"
+ />
+
+ </div>
+ </div>
+ </div>
+
+ <div class="carousel-control">
+ <span class="button previous"></span>
+ <span class="button next"></span>
+ </div>
+ <div class="carousel-page-buttons">
+
+ </div>
+ </div>
+</div>
+
+
+ <div class="article-text" id="artText">
+
+
+
+
+<div class="abstract toc-section"><a id="abstract0" name="abstract0" data-toc="abstract0" class="link-target" title="Abstract"></a><h2>Abstract</h2><a id="article1.front1.article-meta1.abstract1.p1" name="article1.front1.article-meta1.abstract1.p1" class="link-target"></a><p>Reticuloendotheliosis virus (REV) is the most frequent exogenous virus that contaminates attenuated vaccines. Therefore, it is extremely important to select REV-free specific-pathogen-free (SPF) chicken embryos. Generally, REV infection is assessed by detecting REV antibodies in SPF chickens. This present study seeks to evaluate REV infection by replacing serum antibody detection with yolk antibody detection. A cohort of 40 nineteen-week-old SPF chickens were artificially inoculated with REV, with 32 SPF chickens raised in another isolation environment served as a blank control. Eggs and serum from 23-week-old chickens were sampled, and yolks were diluted separately to ratios of 1:150, 1:200, 1:300 and 1:400, which were detected together with serum. We found that the yolk antibody detection findings at a dilution of 1:300 had the highest coincidence rate compared with that based on serum antibody measurements. At a dilution ratio of 1:300 for yolk antibody, 72 chickens were continuously observed for 10 weeks from 25- to 34-weeks-old. Our findings were based on serum antibody or yolk antibody detection, and the evaluation results were completely consistent. Therefore, all serum antibody-positive chickens were yolk antibody-positive, and vice versa. Accordingly, vaccine producers can estimate REV cleanliness in a poultry farm by sampling yolk antibody titers.</p>
+</div>
+
+
+<div class="articleinfo"><p><strong>Citation: </strong>Li Y, Wang T, Wang L, Sun M, Cui Z, Chang S, et al. (2019) Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody. PLoS ONE 14(4):
+ e0213978.
+
+ https://doi.org/10.1371/journal.pone.0213978</p><p><strong>Editor: </strong>Eric HY Lau, The University of Hong Kong, CHINA</p><p><strong>Received: </strong>June 22, 2018; <strong>Accepted: </strong>March 5, 2019; <strong>Published: </strong> April 22, 2019</p><p><strong>Copyright: </strong> © 2019 Li et al. This is an open access article distributed under the terms of the <a href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License</a>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</p><p><strong>Data Availability: </strong>All relevant data are within the manuscript.</p><p><strong>Funding: </strong>The research was supported by the National Quality Infrastructure of China (2017YFF0210200).The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</p><p><strong>Competing interests: </strong> The authors have declared that no competing interests exist.</p></div>
+
+
+
+
+
+<div id="section1" class="section toc-section"><a id="sec001" name="sec001" data-toc="sec001" class="link-target" title="Introduction"></a><h2>Introduction</h2><a id="article1.body1.sec1.p1" name="article1.body1.sec1.p1" class="link-target"></a><p>Avian reticuloendotheliosis virus (REV) is one of the most important pathogens that can cause avian tumors. Recently, epidemiological investigations showed that REV infection is very common in Chinese chickens, particularly in local poultry species [<a href="#pone.0213978.ref001" class="ref-tip">1</a>–<a href="#pone.0213978.ref003" class="ref-tip">3</a>]. As REV can be vertically transmitted through hatching eggs [<a href="#pone.0213978.ref004" class="ref-tip">4</a>], if REV-contaminated eggs are used to produce attenuated vaccines, vaccines can be contaminated by REV, which represents one of the crucial ways to disseminate REV [<a href="#pone.0213978.ref005" class="ref-tip">5</a>–<a href="#pone.0213978.ref007" class="ref-tip">7</a>]. Recently in China, the use of REV-contaminated attenuated vaccines is considered to be an important cause of REV infection [<a href="#pone.0213978.ref008" class="ref-tip">8</a>–<a href="#pone.0213978.ref010" class="ref-tip">10</a>].</p>
+<a id="article1.body1.sec1.p2" name="article1.body1.sec1.p2" class="link-target"></a><p>To overcome this problem, as the Ministry of Agriculture of China stipulated, all attenuated poultry vaccines must use SPF chickens as raw materials to produce attenuated vaccines, and all vaccine producers must confirm whether SPF chickens are infected by REV or not using sampled serum antibody detection. However, because of the specificity of housing standards in SPF poultry farms, others cannot freely enter a breeding area for sampling and detection. In this current study, we attempted to replace antibody detection in serum with antibody detection in egg yolks of SPF chickens.</p>
+</div>
+
+<div id="section2" class="section toc-section"><a id="sec002" name="sec002" data-toc="sec002" class="link-target" title="Results"></a><h2>Results</h2>
+<div id="section1" class="section toc-section"><a id="sec003" name="sec003" class="link-target" title="Determination of the optimal yolk dilution"></a>
+<h3>Determination of the optimal yolk dilution</h3>
+<a id="article1.body1.sec2.sec1.p1" name="article1.body1.sec2.sec1.p1" class="link-target"></a><p>Under the same conditions, we measured REV antibody titers in paired yolk and serum samples collected on the same day or one day before or after in 40 SPF chickens during the initial egg-laying stage when the chickens were 23 weeks old. <a href="#pone-0213978-t001">Table 1</a> shows the “goodness of fit†between yolk antibody titers diluted to various concentrations and serum antibody titers at the required concentration. By comparison, we found that REV antibody detection in the yolk at a 1:300 dilution had the highest goodness of fit with serum antibody measurements, and reached 97.5%.</p>
+<a class="link-target" id="pone-0213978-t001" name="pone-0213978-t001"></a><div class="figure" data-doi="10.1371/journal.pone.0213978.t001"><div class="img-box"><a title="Click for larger image" href="article/figure/image?size=medium&amp;id=info:doi/10.1371/journal.pone.0213978.t001" data-doi="info:doi/10.1371/journal.pone.0213978" data-uri="info:doi/10.1371/journal.pone.0213978.t001"><img src="article/figure/image?size=inline&amp;id=info:doi/10.1371/journal.pone.0213978.t001" alt="thumbnail" class="thumbnail"></a><div class="expand"></div></div><div class="figure-inline-download">
+ Download:
+ <ul><li><div class="definition-label"><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t001">
+ PPT
+ </a></div><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t001">
+ PowerPoint slide
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t001">
+ PNG
+ </a></div><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t001">
+ larger image
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t001">
+ TIFF
+ </a></div><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t001">
+ original image
+ </a></li></ul></div><div class="figcaption"><span>Table 1. </span> Consistent yolk and serum antibody measurements with different dilutions of yolk.</div><p class="caption_target"></p><p class="caption_object"><a href="https://doi.org/10.1371/journal.pone.0213978.t001">
+ https://doi.org/10.1371/journal.pone.0213978.t001</a></p></div></div>
+
+<div id="section2" class="section toc-section"><a id="sec004" name="sec004" class="link-target" title="Comparison of the goodness of fit for ALV-Ab antibody measurements in serum and yolk from SPF chickens of different ages"></a>
+<h3>Comparison of the goodness of fit for ALV-Ab antibody measurements in serum and yolk from SPF chickens of different ages</h3>
+<a id="article1.body1.sec2.sec2.p1" name="article1.body1.sec2.sec2.p1" class="link-target"></a><p>In 25–34-week-old chickens, serum and hatching eggs were sampled once per week, and a total of 720 serum samples and 720 yolk samples were collected from 40 SPF infected chickens and 32 SPF chickens without virus challenge. <a href="#pone-0213978-t002">Table 2</a> showed that the yolk antibody findings were completely consistent with those based on serum antibody detection within 10 weeks, as the serum antibody-positive chickens were all yolk antibody-positive, and the serum antibody-negative chickens were all yolk antibody-negative. Additionally, 35 of 40 SPF chickens challenged with REV alone were always REV antibody-positive in the serum and yolk, while 4 were always REV antibody-negative. All 32 SPF chickens without virus challenge were always REV antibody-positive in the serum and yolk. The goodness of fit for serum antibody and yolk antibody detection reached 100%.</p>
+<a class="link-target" id="pone-0213978-t002" name="pone-0213978-t002"></a><div class="figure" data-doi="10.1371/journal.pone.0213978.t002"><div class="img-box"><a title="Click for larger image" href="article/figure/image?size=medium&amp;id=info:doi/10.1371/journal.pone.0213978.t002" data-doi="info:doi/10.1371/journal.pone.0213978" data-uri="info:doi/10.1371/journal.pone.0213978.t002"><img src="article/figure/image?size=inline&amp;id=info:doi/10.1371/journal.pone.0213978.t002" alt="thumbnail" class="thumbnail"></a><div class="expand"></div></div><div class="figure-inline-download">
+ Download:
+ <ul><li><div class="definition-label"><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t002">
+ PPT
+ </a></div><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t002">
+ PowerPoint slide
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t002">
+ PNG
+ </a></div><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t002">
+ larger image
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t002">
+ TIFF
+ </a></div><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t002">
+ original image
+ </a></li></ul></div><div class="figcaption"><span>Table 2. </span> Agreement of yolk and serum antibody measurements with different dilutions of yolk.</div><p class="caption_target"></p><p class="caption_object"><a href="https://doi.org/10.1371/journal.pone.0213978.t002">
+ https://doi.org/10.1371/journal.pone.0213978.t002</a></p></div></div>
+
+<div id="section3" class="section toc-section"><a id="sec005" name="sec005" class="link-target" title="REV antibody detection in serum and yolk from different SPF chicken populations"></a>
+<h3>REV antibody detection in serum and yolk from different SPF chicken populations</h3>
+<a id="article1.body1.sec2.sec3.p1" name="article1.body1.sec2.sec3.p1" class="link-target"></a><p>A total of 1000 yolk samples and 1000 serum samples from 10 different SPF chicken populations were detected for REV antibody. <a href="#pone-0213978-t003">Table 3</a> showed that all samples tested were negative based on yolk and serum antibody detection. Our evaluation results were consistent and without false positive results, indicating that the test SPF chicken populations were not infected by REV.</p>
+<a class="link-target" id="pone-0213978-t003" name="pone-0213978-t003"></a><div class="figure" data-doi="10.1371/journal.pone.0213978.t003"><div class="img-box"><a title="Click for larger image" href="article/figure/image?size=medium&amp;id=info:doi/10.1371/journal.pone.0213978.t003" data-doi="info:doi/10.1371/journal.pone.0213978" data-uri="info:doi/10.1371/journal.pone.0213978.t003"><img src="article/figure/image?size=inline&amp;id=info:doi/10.1371/journal.pone.0213978.t003" alt="thumbnail" class="thumbnail"></a><div class="expand"></div></div><div class="figure-inline-download">
+ Download:
+ <ul><li><div class="definition-label"><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t003">
+ PPT
+ </a></div><a href="article/figure/powerpoint?id=info:doi/10.1371/journal.pone.0213978.t003">
+ PowerPoint slide
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t003">
+ PNG
+ </a></div><a href="article/figure/image?download&amp;size=large&amp;id=info:doi/10.1371/journal.pone.0213978.t003">
+ larger image
+ </a></li><li><div class="definition-label"><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t003">
+ TIFF
+ </a></div><a href="article/figure/image?download&amp;size=original&amp;id=info:doi/10.1371/journal.pone.0213978.t003">
+ original image
+ </a></li></ul></div><div class="figcaption"><span>Table 3. </span> Detection of REV antibody from 10 SPF chicken flocks in China (random collection).</div><p class="caption_target"></p><p class="caption_object"><a href="https://doi.org/10.1371/journal.pone.0213978.t003">
+ https://doi.org/10.1371/journal.pone.0213978.t003</a></p></div></div>
+</div>
+
+<div id="section3" class="section toc-section"><a id="sec006" name="sec006" data-toc="sec006" class="link-target" title="Discussion"></a><h2>Discussion</h2><a id="article1.body1.sec3.p1" name="article1.body1.sec3.p1" class="link-target"></a><p>Recently, epidemiological surveys have shown that different Chinese chicken populations are frequently infected by REV, especially in local Chinese chicken species [<a href="#pone.0213978.ref001" class="ref-tip">1</a>–<a href="#pone.0213978.ref003" class="ref-tip">3</a>]. To control REV infection, many measures have been employed, including the use of attenuated vaccines without REV contamination. In China and other countries, the possibility of REV contamination in attenuated poultry vaccines has been a major concern for many years. Many REV infections are thought to be caused by REV infection in contaminated attenuated vaccines, particularly for the most frequently used fowlpox virus vaccine (FPV) and anti-Marek’s Disease vaccines [<a href="#pone.0213978.ref005" class="ref-tip">5</a>–<a href="#pone.0213978.ref013" class="ref-tip">13</a>]. Additionally, the capability of REV to integrate into the genome of other viruses complicates its diagnosis and prevention [<a href="#pone.0213978.ref014" class="ref-tip">14</a>–<a href="#pone.0213978.ref021" class="ref-tip">21</a>]. Awad <em>et al</em>. detected REV in contaminated FPV vaccine using PCR identification and REV antibody detection for virus isolation and identification in vaccinated SPF chickens[<a href="#pone.0213978.ref007" class="ref-tip">7</a>]. REV contamination in avian attenuated vaccines can lead to serious consequences, such as a significant reduction in antibody levels in vaccine-immunized chicken populations[<a href="#pone.0213978.ref022" class="ref-tip">22</a>].</p>
+<a id="article1.body1.sec3.p2" name="article1.body1.sec3.p2" class="link-target"></a><p>The REV contamination in attenuated vaccines may occur during the production process, but the use of REV-contaminated chicken embryos as raw materials is always the main cause. The national standards of China specify that vaccine production enterprises or SPF chicken breeding manufactures must periodically measure REV antibody levels in SPF chicken serum to evaluate the REV cleanliness in specific flocks. Because of differences in SPF chicken breeding environments, other individuals should not be allowed to enter a SPF chicken breeding area for sampling. This current approach causes both stress responses in SPF chickens and introduces the risk of false results for SPF chicken serum tests resulting from the inspection process. Therefore, the Ministry of Agriculture of China asked whether yolk antibody detection in hatching eggs could be used as a substitute for serum antibody detection to evaluate exogenous virus contamination in SPF chicken embryos.</p>
+<a id="article1.body1.sec3.p3" name="article1.body1.sec3.p3" class="link-target"></a><p>The yolk dilution has a strong influence on the antibody detection results, as excessive high yolk concentration is prone to yield false negative or false positive results. The results of this present study showed that yolk at a 1:300 dilution gave the best goodness of fit between the antibody-negative or positive results based on yolk or serum antibody detection. To precisely and scientifically reveal the correlation between the yolk and serum antibody detection, we compared REV antibody detection results in the yolk and serum of 72 SPF chickens (40 were inoculated with REV one month prior to egg-laying) for 10 consecutive weeks. We found that for the 72 chickens, serum antibody detection results coincided with yolk antibody results at a rate of 100%. Our findings indicate that it is feasible to replace serum antibody tests with yolk antibody detection to monitor REV infection in SPF chickens.</p>
+<a id="article1.body1.sec3.p4" name="article1.body1.sec3.p4" class="link-target"></a><p>At the optimal dilution determined in this study, a total of 1000 yolk samples and 1000 serum samples from 10 separate SPF chicken populations were tested for REV antibodies, and all showed negative results. The results of undetected antibodies showed that these chickens were not infected with REV or that although these chickens were infected with REV, not enough antibodies were detected. In order to avoid the false negative, we consider that chickens repeatedly tested negatively are not infected with REV, which is very important in flock surveillance. Additionally, detection results that used both methods were fully consistent. Importantly, no false positive results were obtained. These robust results indicate that contemporary SPF chicken embryos in China are mostly or fully not contaminated by REV. Our findings suggest that vaccine production enterprises could evaluate the REV cleanliness of SPF chicken farms by detecting antibodies in the yolk of SPF eggs. This process not only reduces the stress responses of SPF chickens during serum sampling and provides convenience for sampling, it also yields more reliable samples. Indeed, compared with serum sample results, hatching egg-based data are less prone to human error.</p>
+</div>
+
+<div id="section4" class="section toc-section"><a id="sec007" name="sec007" data-toc="sec007" class="link-target" title="Materials and methods"></a><h2>Materials and methods</h2>
+<div id="section1" class="section toc-section"><a id="sec008" name="sec008" class="link-target" title="REV strain"></a>
+<h3>REV strain</h3>
+<a id="article1.body1.sec4.sec1.p1" name="article1.body1.sec4.sec1.p1" class="link-target"></a><p>The strain REV-HA9901 was isolated in 1999 and full-length genomic sequencing had been completed (GenBank Accession No. AY842951) [<a href="#pone.0213978.ref023" class="ref-tip">23</a>]. Supernatants of the pre-frozen virus cells at –80°C were used to calculate TCID<sub>50</sub> by the Karber method; 0.1 mL supernatant of CEF cells contained 10 <sup>4.5</sup> TCID<sub>50</sub>.</p>
+</div>
+
+<div id="section2" class="section toc-section"><a id="sec009" name="sec009" class="link-target" title="Rearing and virus challenge of SPF chickens"></a>
+<h3>Rearing and virus challenge of SPF chickens</h3>
+<a id="article1.body1.sec4.sec2.p1" name="article1.body1.sec4.sec2.p1" class="link-target"></a><p>A total of 40 nineteen-week-old SPF chickens were purchased from SPAFAS Poultry Co., and were reared in HEPA-filtered negative-pressure isolators. At nineteen weeks of age, groups of 13, 14, and 13 chickens were vaccinated with 10<sup>3</sup> TCID<sub>50</sub> HA9901, 10<sup>4</sup> TCID<sub>50</sub> HA9901, and 10<sup>5</sup> TCID<sub>50</sub> of HA9901, respectively. All labeled chickens were separately raised within a single cage in an SPF animal feeding unit so that eggs and serum samples could corresponded 1:1 with chickens. A total of 32 SPF chickens in the same batch were reared in isolation environments as a negative control. All these chickens from each group were sacrificed by intravenous administration of barbiturates. The use of all laboratory animals in this study was approved by the scientific ethical committee of Shandong province.</p>
+</div>
+
+<div id="section3" class="section toc-section"><a id="sec010" name="sec010" class="link-target" title="Determination of the optimal yolk dilution"></a>
+<h3>Determination of the optimal yolk dilution</h3>
+<a id="article1.body1.sec4.sec3.p1" name="article1.body1.sec4.sec3.p1" class="link-target"></a><p>The 40 inoculated SPF chickens all began laying eggs when 23-weeks-old, and the hatching eggs and serum samples were collected from each chicken. Serum samples were diluted to the optimal concentration in accordance with the instructions of the ELISA test kit for REV antibody (IDEXX Company); and yolk samples were diluted to 1:150, 1:200, 1:300, and 1:400. To minimize the possibility of human errors, paired serum and yolk from each chicken were tested using the same kit by the same laboratory staff in simultaneous ELISA experiments with identical conditions. Each sample was tested twice, and if the two values differed greatly the test was repeated. Based on these results, we determined the optimal dilution of yolk at which the detection was in accordance with that determined based on serum antibody detection.</p>
+</div>
+
+<div id="section4" class="section toc-section"><a id="sec011" name="sec011" class="link-target" title="REV antibody detection in serum and yolk among chickens of different ages"></a>
+<h3>REV antibody detection in serum and yolk among chickens of different ages</h3>
+<a id="article1.body1.sec4.sec4.p1" name="article1.body1.sec4.sec4.p1" class="link-target"></a><p>Each week, paired egg and serum samples from each chicken were collected from 72 SPF chickens for 10 weeks from the age of 25 to 34 weeks old. If a chicken did not lay eggs on the blood-collecting day, the egg laid one day before or after the blood collection was used. For REV antibody detection, serum samples were diluted according to the manufacturer’s instructions and yolk samples were diluted in accord with the optimal dilution determined in Section 1.3. To minimize the possibility of human errors, paired serum and yolk from each chicken were tested using the same batch of kits by the same laboratory staff in simultaneous ELISA experiments with identical conditions. Each sample was tested twice, and if the two values differed greatly, tests were repeated. Finally, we compared the “goodness of fit†between the yolk antibody sampled during different stages and serum antibody measurements.</p>
+</div>
+
+<div id="section5" class="section toc-section"><a id="sec012" name="sec012" class="link-target" title="REV antibody detection in the serum and yolk of different SPF chicken populations"></a>
+<h3>REV antibody detection in the serum and yolk of different SPF chicken populations</h3>
+<a id="article1.body1.sec4.sec5.p1" name="article1.body1.sec4.sec5.p1" class="link-target"></a><p>Paired egg and serum samples from each chicken were sampled from 10 distinct Chinese SPF chicken populations. Serum samples were diluted in accordance with the test kit manufacturer’s instructions (IDEXX Company), and yolk samples were diluted in accordance with the optimal dilution that was determined. We separately estimated the REV cleanliness for different SPF chicken populations based on the two previously described examination methods, and compared differences in the actual operation. To minimize the introduction of human errors, paired serum and yolk samples from a chicken were tested using the same batch of kits by the same laboratory staff in simultaneous ELISA experiments with identical conditions. Each sample was tested twice, and if the two values differed greatly the tests were repeated.</p>
+</div>
+</div>
+
+
+
+
+
+<div class="section toc-section"><a id="ack" name="ack" data-toc="ack" title="Acknowledgments" class="link-target"></a><h2>Acknowledgments</h2>
+<a id="article1.back1.ack1.p1" name="article1.back1.ack1.p1" class="link-target"></a><p>The research was supported by the National Quality Infrastructure of China (2017YFF0210200).</p>
+</div><div class="toc-section"><a id="references" name="references" class="link-target" data-toc="references" title="References"></a><h2>References</h2><ol class="references"><li id="ref1"><span class="order">1.
+ </span><a name="pone.0213978.ref001" id="pone.0213978.ref001" class="link-target"></a>Cheng Z., Shi Y., Zhan L., Zhu G., Diao X. and Cui Z. (2007) Occurrence of reticuloendotheliosis in Chinese partridge. J Vet Med Sci. 69(12): 1295–1298. pmid:18176029 <ul class="reflinks"><li><a href="#" data-author="Cheng" data-cit="ChengZ.%2C%20ShiY.%2C%20ZhanL.%2C%20ZhuG.%2C%20DiaoX.%20and%20CuiZ.%20%282007%29%20Occurrence%20of%20reticuloendotheliosis%20in%20Chinese%20partridge.%20J%20Vet%20Med%20Sci.%2069%2812%29%3A%201295%E2%80%931298.%2018176029" data-title="Occurrence%20of%20reticuloendotheliosis%20in%20Chinese%20partridge" target="_new" title="Go to article in CrossRef">
+ View Article
+ </a></li><li><a href="http://www.ncbi.nlm.nih.gov/pubmed/18176029" target="_new" title="Go to article in PubMed">
+ PubMed/NCBI
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Occurrence+of+reticuloendotheliosis+in+Chinese+partridge+Cheng+2007" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li><li id="ref2"><span class="order">2.
+ </span><a name="pone.0213978.ref002" id="pone.0213978.ref002" class="link-target"></a>Cui Z., Sun S., Zhang Z., and Meng S. (2009) Simultaneous endemic infections with subgroup J avian leukosis virus and reticuloendotheliosis virus in commercial and local breeds of chickens. Avian Pathol. 38(6): 443–448. pmid:19937533 <ul class="reflinks" data-doi="10.1080/03079450903349188"><li><a href="https://doi.org/10.1080/03079450903349188" data-author="doi-provided" data-cit="doi-provided" data-title="doi-provided" target="_new" title="Go to article">
+ View Article
+ </a></li><li><a href="http://www.ncbi.nlm.nih.gov/pubmed/19937533" target="_new" title="Go to article in PubMed">
+ PubMed/NCBI
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Simultaneous+endemic+infections+with+subgroup+J+avian+leukosis+virus+and+reticuloendotheliosis+virus+in+commercial+and+local+breeds+of+chickens+Cui+2009" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li><li id="ref3"><span class="order">3.
+ </span><a name="pone.0213978.ref003" id="pone.0213978.ref003" class="link-target"></a>Zhao P., Ma C., Du Y., and Cui Z. (2012) Serological survey of the Reticuloendotheliosis virus infection in China native chicken flocks. Pak Vet J. 32:621–623. <ul class="reflinks"><li><a href="#" data-author="Zhao" data-cit="ZhaoP.%2C%20MaC.%2C%20DuY.%2C%20and%20CuiZ.%20%282012%29%20Serological%20survey%20of%20the%20Reticuloendotheliosis%20virus%20infection%20in%20China%20native%20chicken%20flocks.%20Pak%20Vet%20J.%2032%3A621%E2%80%93623." data-title="Serological%20survey%20of%20the%20Reticuloendotheliosis%20virus%20infection%20in%20China%20native%20chicken%20flocks" target="_new" title="Go to article in CrossRef">
+ View Article
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Serological+survey+of+the+Reticuloendotheliosis+virus+infection+in+China+native+chicken+flocks+Zhao+2012" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li><li id="ref22"><span class="order">22.
+ </span><a name="pone.0213978.ref022" id="pone.0213978.ref022" class="link-target"></a>Witter R. L., Lee L. F., Bacon L. D. and Smith E. J. (1979) Depression of vaccinal immunity to Marek’s disease by infection with reticuloendotheliosis virus. Infection and Immunity. 26:90–98. pmid:227800 <ul class="reflinks"><li><a href="#" data-author="Witter" data-cit="WitterR.%20L.%2C%20LeeL.%20F.%2C%20BaconL.%20D.%20and%20SmithE.%20J.%20%281979%29%20Depression%20of%20vaccinal%20immunity%20to%20Marek%E2%80%99s%20disease%20by%20infection%20with%20reticuloendotheliosis%20virus.%20Infection%20and%20Immunity.%2026%3A90%E2%80%9398.%20227800" data-title="Depression%20of%20vaccinal%20immunity%20to%20Marek%E2%80%99s%20disease%20by%20infection%20with%20reticuloendotheliosis%20virus" target="_new" title="Go to article in CrossRef">
+ View Article
+ </a></li><li><a href="http://www.ncbi.nlm.nih.gov/pubmed/227800" target="_new" title="Go to article in PubMed">
+ PubMed/NCBI
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Depression+of+vaccinal+immunity+to+Marek%E2%80%99s+disease+by+infection+with+reticuloendotheliosis+virus+Witter+1979" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li><li id="ref23"><span class="order">23.
+ </span><a name="pone.0213978.ref023" id="pone.0213978.ref023" class="link-target"></a>Wang Y., Cui Z. and Jiang S. (2005) Sequencing and analysis of whole genome nucleotide sequence of Chinese REV isolate HA9901. Science in China Serices C: Life Sciences. 35:340–380. <ul class="reflinks"><li><a href="#" data-author="Wang" data-cit="WangY.%2C%20CuiZ.%20and%20JiangS.%20%282005%29%20Sequencing%20and%20analysis%20of%20whole%20genome%20nucleotide%20sequence%20of%20Chinese%20REV%20isolate%20HA9901.%20Science%20in%20China%20Serices%20C%3A%20Life%20Sciences.%2035%3A340%E2%80%93380." data-title="Sequencing%20and%20analysis%20of%20whole%20genome%20nucleotide%20sequence%20of%20Chinese%20REV%20isolate%20HA9901" target="_new" title="Go to article in CrossRef">
+ View Article
+ </a></li><li><a href="http://scholar.google.com/scholar?q=Sequencing+and+analysis+of+whole+genome+nucleotide+sequence+of+Chinese+REV+isolate+HA9901+Wang+2005" target="_new" title="Go to article in Google Scholar">
+ Google Scholar
+ </a></li></ul></li></ol></div>
+
+
+
+ <div class="ref-tooltip">
+ <div class="ref_tooltip-content">
+
+ </div>
+ </div>
+
+ </div>
+ </div>
+ </div>
+
+ </section>
+ <aside class="article-aside">
+
+
+<!--[if IE 9]>
+<style>
+.dload-xml {margin-top: 38px}
+</style>
+<![endif]-->
+<div class="dload-menu">
+ <div class="dload-pdf">
+ <a href="/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ id="downloadPdf" target="_blank">Download PDF</a>
+ </div>
+ <div data-js-tooltip-hover="trigger" class="dload-hover">&nbsp;
+ <ul class="dload-xml" data-js-tooltip-hover="target">
+ <li><a href="/plosone/article/citation?id=10.1371/journal.pone.0213978"
+ id="downloadCitation">Citation</a></li>
+ <li><a href="/plosone/article/file?id=10.1371/journal.pone.0213978&type=manuscript"
+ id="downloadXml">XML</a>
+ </li>
+ </ul>
+
+ </div>
+</div>
+
+<div class="aside-container">
+
+<div class="print-article" id="printArticle" data-js-tooltip-hover="trigger">
+
+ Print
+ <ul class="print-options" data-js-tooltip-hover="target">
+ <li>
+ <a href="#" onclick="window.print(); return false;" class="preventDefault" id="printBrowser" title="Print
+ Article">Print article</a>
+ </li>
+
+
+
+
+<li>
+<a title="Odyssey Press" href="https://www.odysseypress.com/onlinehost/reprint_order.php?type=A&amp;page=0&amp;journal=7&amp;doi=10.1371%2Fjournal.pone.0213978&amp;volume=&amp;issue=&amp;title=Assessment%20on%20reticuloendotheliosis%20virus%20infection%20in%20specific-pathogen-free%20chickens%20based%20on%20detection%20of%20yolk%20antibody&amp;author_name=Yang%20Li%2C%20Tuanjie%20Wang%2C%20Lin%20Wang%2C%20Mingjun%20Sun%2C%20Zhizhong%20Cui%2C%20Shuang%20Chang%2C%20Yongping%20Wu%2C%20Xiaodong%20Zhang%2C%20Xiaohui%20Yu%2C%20Tao%20Sun%2C%20Peng%20Zhao&amp;start_page=1&amp;end_page=7">EzReprint </a>
+</li>
+
+ </ul>
+</div>
+<div class="share-article" id="shareArticle" data-js-tooltip-hover="trigger">
+ Share
+ <ul data-js-tooltip-hover="target" class="share-options" id="share-options">
+
+<li><a href="https://www.reddit.com/submit?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978" id="shareReddit" target="_blank" title="Submit to Reddit"><img src="/plosone/resource/img/icon.reddit.16.png" width="16" height="16" alt="Reddit">Reddit</a></li>
+
+<li><a href="https://plus.google.com/share?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978" id="shareGoogle" target="_blank" title="Share on Google+"><img src="/plosone/resource/img/icon.gplus.16.png" width="16" height="16" alt="Google+">Google+</a></li>
+
+<li><a href="https://www.facebook.com/share.php?u=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978&t=Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" id="shareFacebook" target="_blank" title="Share on Facebook"><img src="/plosone/resource/img/icon.fb.16.png" width="16" height="16" alt="Facebook">Facebook</a></li>
+
+<li><a href="https://www.linkedin.com/shareArticle?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978&title=Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody&summary=Checkout this article I found at PLOS" id="shareLinkedIn" target="_blank" title="Add to LinkedIn"><img src="/plosone/resource/img/icon.linkedin.16.png" width="16" height="16" alt="LinkedIn">LinkedIn</a></li>
+
+<li><a href="https://www.mendeley.com/import/?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978" id="shareMendeley" target="_blank" title="Add to Mendeley"><img src="/plosone/resource/img/icon.mendeley.16.png" width="16" height="16" alt="Mendeley">Mendeley</a></li>
+
+<li><a href="https://www.pubchase.com/library?add_aid=10.1371/journal.pone.0213978&source=plos" id="sharePubChase" target="_blank" title="Add to PubChase"><img src="/plosone/resource/img/icon.pc.16.png" width="16" height="16" alt="PubChase">PubChase</a></li>
+
+ <li><a href="https://twitter.com/intent/tweet?url=https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978&text=%23PLOSONE%3A%20Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" target="_blank" title="share on Twitter" id="twitter-share-link"><img src="/plosone/resource/img/icon.twtr.16.png" width="16" height="16" alt="Twitter">Twitter</a></li>
+
+<li><a href="mailto:?subject=Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody&body=I%20thought%20you%20would%20find%20this%20article%20interesting.%20From%20PLOS ONE:%20https%3A%2F%2Fdx.plos.org%2F10.1371%2Fjournal.pone.0213978" id="shareEmail" rel="noreferrer" aria-label="Email"><img src="/plosone/resource/img/icon.email.16.png" width="16" height="16" alt="Email">Email</a></li>
+ </ul>
+</div></div>
+
+
+ <!-- Crossmark 2.0 widget -->
+<script src="https://crossmark-cdn.crossref.org/widget/v2.0/widget.js"></script>
+<a data-target="crossmark"><img width="150" src="https://crossmark-cdn.crossref.org/widget/v2.0/logos/CROSSMARK_BW_horizontal.svg"></a>
+<!-- End Crossmark 2.0 widget -->
+
+
+
+
+
+
+
+<div class="skyscraper-container">
+ <div class="title">Advertisement</div>
+<!-- DoubleClick Ad Zone -->
+ <div class='advertisement' id='div-gpt-ad-1458247671871-1' style='width:160px; height:600px;'>
+ <script type='text/javascript'>
+ googletag.cmd.push(function() { googletag.display('div-gpt-ad-1458247671871-1'); });
+ </script>
+ </div>
+</div>
+
+
+
+
+<div class="subject-areas-container">
+ <h3>Subject Areas <div id="subjInfo">?</div>
+ <div id="subjInfoText">
+ <p>For more information about PLOS Subject Areas, click
+ <a href="https://github.com/PLOS/plos-thesaurus/blob/develop/README.md" target="_blank" title="Link opens in new window">here</a>.</p>
+ <span class="inline-intro">We want your feedback.</span> Do these Subject Areas make sense for this article? Click the target next to the incorrect Subject Area and let us know. Thanks for your help!
+
+
+ </div>
+ </h3>
+ <ul id="subjectList">
+ <li>
+ <a class="taxo-term" title="Search for articles about Chickens"
+ href="/plosone/search?filterSubjects=Chickens&filterJournals=PLoSONE&q=">Chickens</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Chickens"><p class="taxo-explain">Is the Subject Area <strong>"Chickens"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Antibodies"
+ href="/plosone/search?filterSubjects=Antibodies&filterJournals=PLoSONE&q=">Antibodies</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Antibodies"><p class="taxo-explain">Is the Subject Area <strong>"Antibodies"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Livestock"
+ href="/plosone/search?filterSubjects=Livestock&filterJournals=PLoSONE&q=">Livestock</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Livestock"><p class="taxo-explain">Is the Subject Area <strong>"Livestock"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Attenuated vaccines"
+ href="/plosone/search?filterSubjects=Attenuated+vaccines&filterJournals=PLoSONE&q=">Attenuated vaccines</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Attenuated vaccines"><p class="taxo-explain">Is the Subject Area <strong>"Attenuated vaccines"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Enzyme-linked immunoassays"
+ href="/plosone/search?filterSubjects=Enzyme-linked+immunoassays&filterJournals=PLoSONE&q=">Enzyme-linked immunoassays</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Enzyme-linked immunoassays"><p class="taxo-explain">Is the Subject Area <strong>"Enzyme-linked immunoassays"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Poultry"
+ href="/plosone/search?filterSubjects=Poultry&filterJournals=PLoSONE&q=">Poultry</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Poultry"><p class="taxo-explain">Is the Subject Area <strong>"Poultry"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Animal sexual behavior"
+ href="/plosone/search?filterSubjects=Animal+sexual+behavior&filterJournals=PLoSONE&q=">Animal sexual behavior</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Animal sexual behavior"><p class="taxo-explain">Is the Subject Area <strong>"Animal sexual behavior"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ <li>
+ <a class="taxo-term" title="Search for articles about Vaccines"
+ href="/plosone/search?filterSubjects=Vaccines&filterJournals=PLoSONE&q=">Vaccines</a>
+ <span class="taxo-flag">&nbsp;</span>
+ <div class="taxo-tooltip" data-categoryname="Vaccines"><p class="taxo-explain">Is the Subject Area <strong>"Vaccines"</strong> applicable to this article?
+ <button id="noFlag" data-action="remove">Yes</button>
+ <button id="flagIt" value="flagno" data-action="add">No</button></p>
+ <p class="taxo-confirm">Thanks for your feedback.</p>
+ </div>
+ </li>
+ </ul>
+</div>
+<div id="subjectErrors"></div>
+
+
+<div class="twitter-container">
+ <h3>Archived Tweets</h3>
+ <ul id="tweetList">
+
+ </ul>
+ <div class="load-more">Load more <span></span></div>
+ <div class="view-all"><a href="https://alm.plos.org/works/doi.org/10.1371/journal.pone.0213978?source_id=twitter">View all tweets</a>
+ </div>
+</div>
+
+<script type="text/template" id="twitterModuleItemTemplate">
+ <% _.each(items, function(item) { %>
+ <li>
+ <div class="tweet-info">
+ <a href="https://twitter.com/<%= item.user %>">
+ <span class="imgholder">
+ <img class="imgLoad" src="<%= item.user_profile_image %>">
+ </span>
+ <div class="tweetDate"><%= item.created_at %></div>
+ <div class="tweetUser">
+ <strong><%= item.user_name %></strong>
+ <span>@<%= item.user %></span>
+ </div>
+ </a>
+ </div>
+ <div class="tweetText">
+ <%= item.text %>
+ </div>
+ <div id="tweetActions">
+ <a class="tweet-reply" href="https://twitter.com/intent/tweet?in_reply_to<%= item.id %>&amp;text=@<%= item.user %>">
+ <div>&nbsp;</div> Reply
+ </a>
+ <a class="tweet-retweet" href="https://twitter.com/intent/retweet?tweet_id=<%= item.id %>">
+ <div>&nbsp;</div> Retweet
+ </a>
+ <a class="tweet-favorite" href="https://twitter.com/intent/favorite?tweet_id=<%= item.id %>">
+ <div>&nbsp;</div> Favorite
+ </a>
+ </div>
+ </li>
+ <% }); %>
+</script>
+
+
+ </aside>
+</div>
+
+
+
+
+</main>
+
+<footer id="pageftr">
+ <div class="row">
+
+
+ <div class="block x-small">
+
+
+<ul class="nav nav-secondary">
+ <li class="ftr-header"><a href="https://www.plos.org/publications/journals/">Publications</a></li>
+ <li><a href="/plosbiology/" id="ftr-bio">PLOS Biology</a></li>
+ <li><a href="/plosmedicine/" id="ftr-med">PLOS Medicine</a></li>
+ <li><a href="/ploscompbiol/" id="ftr-compbio">PLOS Computational Biology</a></li>
+ <li><a href="/plosgenetics/" id="ftr-gen">PLOS Genetics</a></li>
+ <li><a href="/plospathogens/" id="ftr-path">PLOS Pathogens</a></li>
+ <li><a href="/plosone/" id="ftr-one">PLOS ONE</a></li>
+ <li><a href="/plosntds/" id="ftr-ntds">PLOS Neglected Tropical Diseases</a></li>
+ </ul>
+ </div>
+
+ <div class="block xx-small">
+
+
+<ul class="nav nav-tertiary">
+ <li>
+ <a href="https://www.plos.org" id="ftr-home">Home</a>
+ </li>
+ <li>
+ <a href="https://blogs.plos.org" id="ftr-blog">Blogs</a>
+ </li>
+ <li>
+ <a href="https://collections.plos.org" id="ftr-collections">Collections</a>
+ </li>
+ <li>
+ <a href="mailto:webmaster@plos.org" id="ftr-feedback">Give feedback</a>
+ </li>
+ <li>
+ <a href="/plosone/lockss-manifest" id="ftr-lockss">LOCKSS</a>
+ </li>
+</ul>
+ </div>
+ <div class="block xx-small">
+
+<ul class="nav nav-primary">
+ <li><a href="https://www.plos.org/privacy-policy" id="ftr-privacy">Privacy Policy</a></li>
+ <li><a href="https://www.plos.org/terms-of-use" id="ftr-terms">Terms of Use</a></li>
+ <li><a href="https://www.plos.org/advertise/" id="ftr-advertise">Advertise</a></li>
+ <li><a href="https://www.plos.org/media-inquiries" id="ftr-media">Media Inquiries</a></li>
+ <li><a href="https://www.plos.org/contact" id="ftr-contact">Contact</a></li>
+</ul>
+ </div>
+ <div class="block x-small">
+
+
+<p class="footer-non-profit-statement">PLOS is a nonprofit 501(c)(3) corporation, #C2354500, based in San Francisco, California, US</p> <img src="/plosone/resource/img/logo-plos-footer.png" alt="PLOS" class="logo-footer"/>
+ </div>
+
+
+
+&nbsp;
+<!--
+ Webapp build: 3.7.12 at 20191001163109 by teamcity, commit:
+ Service build: 2.5.4 at 20191001163228 by teamcity, commit:
+ Enabled dev features: []
+ -->
+
+ </div>
+
+
+
+</footer>
+
+
+
+
+<script type="text/javascript">
+ var ArticleData = {
+ doi: '10.1371/journal.pone.0213978',
+ title: '<article-title xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody<\/article-title>',
+ date: 'Apr 22, 2019'
+ };
+</script>
+
+<script type="text/javascript">
+ var ALM_CONFIG = ALM_CONFIG || {};
+ ALM_CONFIG.hostname = "https://alm.plos.org";
+ ALM_CONFIG.apiKey = "3pezRBRXdyzYW6ztfwft";
+ ALM_CONFIG.host = "https://alm.plos.org/api/v5/articles";
+</script>
+<script type="text/javascript">
+ var ALM_CONFIG = ALM_CONFIG || {};
+ ALM_CONFIG.hostname = "https://alm.plos.org";
+ ALM_CONFIG.apiKey = "3pezRBRXdyzYW6ztfwft";
+ ALM_CONFIG.host = "https://alm.plos.org/api/v5/articles";
+</script>
+
+
+
+
+
+
+<script type="text/javascript" async src="https://platform.twitter.com/widgets.js"></script>
+
+
+
+
+
+<!-- This file should be loaded before the renderJs, to avoid conflicts with the FigShare, that implements the MathJax also. -->
+
+<!-- mathjax configuration options -->
+<!-- more can be found at http://docs.mathjax.org/en/latest/ -->
+<script type="text/x-mathjax-config">
+MathJax.Hub.Config({
+ "HTML-CSS": {
+ scale: 100,
+ availableFonts: ["STIX","TeX"],
+ preferredFont: "STIX",
+ webFont: "STIX-Web",
+ linebreaks: { automatic: false }
+ },
+ jax: ["input/MathML", "output/HTML-CSS"]
+});
+</script>
+
+<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=MML_HTMLorMML"></script>
+
+<script src="/plosone/resource/compiled/asset_57YNUH6YLHJPCLJ7347ODA3HRPF472A4.js"></script>
+<div class="reveal-modal-bg"></div>
+</body>
+</html>
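The PLOS HTML fixture ends here. Its inline `var ArticleData = {...}` script near the bottom of the page carries the DOI and publication date, which is the kind of embedded metadata a unit test can assert against. A minimal sketch of such a check, assuming the fixture is saved as `python/tests/files/plos_article.html` (a hypothetical path; the real filename appears in the diff header for this file, above this hunk):

```python
# Sketch only, not the repository's actual test code.
# The fixture path is an assumption; substitute the filename used in this diff.
import re


def extract_article_data(html: str) -> dict:
    """Pull doi/date values out of the inline `var ArticleData = {...}` script."""
    fields = {}
    for key in ("doi", "date"):
        m = re.search(rf"{key}:\s*'([^']*)'", html)
        fields[key] = m.group(1) if m else None
    return fields


def test_plos_fixture_article_data():
    with open("python/tests/files/plos_article.html", encoding="utf-8") as f:
        html = f.read()
    data = extract_article_data(html)
    assert data["doi"] == "10.1371/journal.pone.0213978"
    assert data["date"] == "Apr 22, 2019"
```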
diff --git a/python/tests/files/scielo_article.jats.xml b/python/tests/files/scielo_article.jats.xml
new file mode 100644
index 0000000..08c864e
--- /dev/null
+++ b/python/tests/files/scielo_article.jats.xml
@@ -0,0 +1,336 @@
+<?xml version="1.0" encoding="ISO-8859-1"?><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+<front>
+<journal-meta>
+<journal-id>1683-9803</journal-id>
+<journal-title><![CDATA[Pediatría (Asunción)]]></journal-title>
+<abbrev-journal-title><![CDATA[Pediatr. (Asunción)]]></abbrev-journal-title>
+<issn>1683-9803</issn>
+<publisher>
+<publisher-name><![CDATA[Sociedad Paraguaya de Pediatría]]></publisher-name>
+</publisher>
+</journal-meta>
+<article-meta>
+<article-id>S1683-98032015000200002</article-id>
+<article-id pub-id-type="doi">10.18004/ped.2015.agosto.102-107</article-id>
+<title-group>
+<article-title xml:lang="es"><![CDATA[Prevalencia de desnutrición y hábitos alimentarios en niños menores de 5 años en las comunidades indígenas de Yby Yau y Azote’y, 2011]]></article-title>
+<article-title xml:lang="en"><![CDATA[Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011]]></article-title>
+</title-group>
+<contrib-group>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Ruiz Valiente]]></surname>
+<given-names><![CDATA[Syntia Carolina]]></given-names>
+</name>
+<xref ref-type="aff" rid="A01"/>
+</contrib>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Ruiz Cañete]]></surname>
+<given-names><![CDATA[Manuel]]></given-names>
+</name>
+<xref ref-type="aff" rid="A02"/>
+</contrib>
+<contrib contrib-type="author">
+<name>
+<surname><![CDATA[Cohene Velazquez]]></surname>
+<given-names><![CDATA[Bartola]]></given-names>
+</name>
+<xref ref-type="aff" rid="A03"/>
+</contrib>
+</contrib-group>
+<aff id="A01">
+<institution><![CDATA[,Hospital General Pediátrico Niños Acosta Ñu. Reducto-San Lorenzo, Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<aff id="A02">
+<institution><![CDATA[,Hospital General Pediátrico Niños Acosta Ñu. Reducto-San Lorenzo, Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<aff id="A03">
+<institution><![CDATA[,Puesto de Salud de Paso Tuya. Azote’y. Paraguay ]]></institution>
+<addr-line><![CDATA[ ]]></addr-line>
+<country>Paraguay</country>
+</aff>
+<pub-date pub-type="pub">
+<day>30</day>
+<month>08</month>
+<year>2015</year>
+</pub-date>
+<pub-date pub-type="epub">
+<day>30</day>
+<month>08</month>
+<year>2015</year>
+</pub-date>
+<volume>42</volume>
+<numero>2</numero>
+<fpage>102</fpage>
+<lpage>107</lpage>
+<copyright-statement/>
+<copyright-year/>
+<self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_arttext&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_abstract&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><self-uri xlink:href="http://scielo.iics.una.py/scielo.php?script=sci_pdf&amp;pid=S1683-98032015000200002&amp;lng=en&amp;nrm=iso"></self-uri><abstract abstract-type="short" xml:lang="es"><p><![CDATA[Introducción: La infancia es una etapa trascendental en el desarrollo evolutivo del hombre, para lo cual es fundamental una adecuada nutrición. La desnutrición infantil no es solo un problema de falta de alimentos, es un conflicto social más profundo. La prevalencia de desnutrición en menores de 5 años del país es de 5,9% según datos del Instituto Nacional de Alimentación y Nutrición. Objetivo: Determinar la prevalencia de desnutrición y hábitos alimentarios en niños menores de 5 años de las comunidades indígenas de Yby Yaú y Azote’y. Materiales y Métodos: Estudio descriptivo, transversal, realizado de enero a abril del 2011, que identificó la prevalencia de desnutrición infantil en niños indígenas de las etnias Pa'i Tavyterã y Mbya Guaraní de 11 comunidades indígenas de Yby Yau y Azote’y. Fueron examinados 349 menores de 5 años de edad. Para la evaluación del estado nutricional se utilizó la curva de crecimiento de la OMS. Los niños/as fueron pesados/as en balanzas mecánicas. Para la medida de la altura, los mayores de dos años fueron medidos con el tallimetro y los menores de 2 años con cinta métrica. Resultados: Se observó desnutrición en 53 niños que equivale al 15% de la muestra. De estos 60,4% padecían de desnutrición moderada y 39,6% desnutrición grave. El mayor porcentaje de desnutrición se encontró en el grupo de edad de 0 a 24 meses con 71,6%. El 77% de los niños tenían desnutrición crónica. Conclusiones: La prevalencia de desnutrición en indígenas en Yby Yaú y Azote’y es de 15%, lo que sobrepasa los índices de desnutrición en menores de 5 años del país.]]></p></abstract>
+<abstract abstract-type="short" xml:lang="en"><p><![CDATA[Introduction: Childhood is a crucial stage in the development of humans, which is why proper nutrition is essential for this stage. Child malnutrition is not just a problem of lack of food, it is rooted in deeper social problems. The prevalence of malnutrition in children under five years of age in Paraguay is 5.9% , according to the Paraguayan National Institute of Food and Nutrition. Objective: Determine the prevalence of malnutrition and the eating habits in children under five years of age in indigenous communities in the towns of Azote'y and Yaú Yby. Materials and Methods: This was a descriptive, cross-sectional study conducted from January to April 2011, which identified the prevalence of child malnutrition in indigenous children in 11 ethnic Pa'i Tavyterá and Mbya Guarani indigenous communities in Azote'y and Yby Yau. We examined 349 children under 5 years of age. The World Health Organization (WHO) growth charts were used to assess nutritional status. Children were weighed with mechanical scales. To measure height, children two and older were measured with a stadiometer and children younger than two were measured with tape. Results: Malnutrition was observed in 53 children (15% of the sample). Of these, 60.4% were suffering from moderate malnutrition and 39.6% from severe malnutrition. The highest percentage of malnutrition was found in the 0-24 month age group (71.6%). 77% of children had chronic malnutrition. Conclusions: The prevalence of malnutrition in indigenous children in Yby Yaú and Azote'y is 15%, which exceeds the national malnutrition rates in children under five years of age.]]></p></abstract>
+<kwd-group>
+<kwd lng="es"><![CDATA[Desnutrición aguda]]></kwd>
+<kwd lng="es"><![CDATA[desnutrición crónica]]></kwd>
+<kwd lng="es"><![CDATA[indígenas]]></kwd>
+<kwd lng="en"><![CDATA[Acute malnutrition]]></kwd>
+<kwd lng="en"><![CDATA[chronic malnutrition]]></kwd>
+<kwd lng="en"><![CDATA[indigenous]]></kwd>
+</kwd-group>
+</article-meta>
+</front><body><![CDATA[ <p align="right"><font size="3" face="Verdana"><b>ART&Iacute;CULO ORIGINAL</b></font></p> <p align="left">&nbsp;</p> <p align="left"><font size="4" face="Verdana"><b>Prevalencia de desnutrici&oacute;n y h&aacute;bitos alimentarios en&nbsp; ni&ntilde;os menores de 5 a&ntilde;os en las comunidades ind&iacute;genas de Yby Yau y Azote&rsquo;y, 2011</b></font></p> <p align="left"><font size="3" face="Verdana"><b><i>Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011</i></b></font></p> <p align="center">&nbsp;</p> <p align="left"><font size="2" face="Verdana"><b>Syntia Carolina Ruiz Valiente<sup>(1)</sup>, Manuel Ruiz Ca&ntilde;ete<sup>(2)</sup>, Bartola Cohene Velazquez<sup>(3)</sup></b></font></p> <p align="left"> <font size="2" face="Verdana">1. Hospital General Pedi&aacute;trico Ni&ntilde;os Acosta &Ntilde;u. Reducto-San Lorenzo, Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana">2. Centro de Salud de Yby Yau. Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana">3. Puesto de Salud de Paso Tuya. Azote&rsquo;y. Paraguay.</font></p> <p align="left"> <font size="2" face="Verdana"><b>Correspondencia</b>: Syntia Carolina Ruiz Valiente. E-mail: scrv_py@hotmail.com</font></p> ]]></body>
+<body><![CDATA[<p align="left"> <font size="2" face="Verdana">Recibido: 24/01/2015; Aceptado: 10/06/2015.</font></p> <p align="left"> <font size="2" face="Verdana"><i>Los autores declaran que no existen conflictos de inter&eacute;s en el presente estudio.</i></font></p> <p align="left">&nbsp;</p> <hr size="1" noshade> <p align="left"><font size="2" face="Verdana"><b>RESUMEN</b></font></p> <p align="left"><font size="2" face="Verdana"><b>Introducci&oacute;n: </b>La infancia es una etapa trascendental en el desarrollo evolutivo del hombre, para lo cual es fundamental una adecuada nutrici&oacute;n. La desnutrici&oacute;n infantil no es solo un problema de falta de alimentos, es un conflicto social m&aacute;s profundo. La prevalencia de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s es de 5,9% seg&uacute;n datos del Instituto Nacional de Alimentaci&oacute;n y Nutrici&oacute;n. <b>Objetivo</b>: Determinar la prevalencia de desnutrici&oacute;n y h&aacute;bitos alimentarios en ni&ntilde;os menores de 5 a&ntilde;os de las comunidades ind&iacute;genas de Yby Ya&uacute; y Azote&rsquo;y. <b>Materiales y M&eacute;todos:</b> Estudio descriptivo, transversal, realizado de enero a abril del 2011, que identific&oacute; la prevalencia de desnutrici&oacute;n infantil en ni&ntilde;os ind&iacute;genas de las etnias Pa'i Tavyter&atilde; y Mbya Guaran&iacute; de 11 comunidades ind&iacute;genas de Yby Yau y Azote&rsquo;y. Fueron examinados 349 menores de 5 a&ntilde;os de edad. Para la evaluaci&oacute;n del estado nutricional se utiliz&oacute; la curva de crecimiento de la OMS. Los ni&ntilde;os/as fueron pesados/as en balanzas mec&aacute;nicas. Para la medida de la altura, los mayores de dos a&ntilde;os fueron medidos con el tallimetro y los menores de 2 a&ntilde;os con cinta m&eacute;trica. <b>Resultados:</b> Se observ&oacute; desnutrici&oacute;n en 53 ni&ntilde;os que equivale al 15% de la muestra. De estos 60,4% padec&iacute;an de desnutrici&oacute;n moderada y 39,6% desnutrici&oacute;n grave. El mayor porcentaje de desnutrici&oacute;n se encontr&oacute; en el grupo de edad de 0 a 24 meses con 71,6%. El 77% de los ni&ntilde;os ten&iacute;an desnutrici&oacute;n cr&oacute;nica. <b>Conclusiones:</b> La prevalencia de desnutrici&oacute;n en ind&iacute;genas en Yby Ya&uacute; y Azote&rsquo;y es de 15%, lo que sobrepasa los &iacute;ndices de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s.</font></p> <p align="left"><font size="2" face="Verdana"><b>Palabras clave:</b> Desnutrici&oacute;n aguda, desnutrici&oacute;n cr&oacute;nica, ind&iacute;genas.</font></p> <p align="left">&nbsp;</p> <p align="left"><font size="2" face="Verdana"><b>ABSTRACT</b></font></p> <p align="left"><font size="2" face="Verdana"><b>Introduction:</b> Childhood is a crucial stage in the development of humans, which is why proper nutrition is essential for this stage. Child malnutrition is not just a problem of lack of food, it is rooted in deeper social problems. The prevalence of malnutrition in children under five years of age&nbsp; in Paraguay is 5.9% , according to the Paraguayan National Institute of Food and Nutrition. <b>Objective</b>: Determine the prevalence of malnutrition and the eating habits in children under five years of age in indigenous communities in the towns of Azote'y and Ya&uacute; Yby. 
<b>Materials and Methods</b>: This was a descriptive, cross-sectional study conducted from January to April 2011, which identified the prevalence of child malnutrition in indigenous children in 11 ethnic Pa'i Tavyter&aacute; and Mbya Guarani indigenous communities in Azote'y and Yby Yau. We examined 349 children under 5 years of age. The World Health Organization (WHO) growth charts were used to assess nutritional status. Children were weighed with mechanical scales. To measure height, children two and older were measured with a stadiometer and children younger than two were measured with tape. <b>Results</b>: Malnutrition was observed in 53 children (15% of the sample). Of these, 60.4% were suffering from moderate malnutrition and 39.6% from severe malnutrition. The highest percentage of malnutrition was found in the 0-24 month age group (71.6%). 77% of children had chronic malnutrition. <b>Conclusions</b>: The prevalence of malnutrition in indigenous children in Yby Ya&uacute; and Azote'y is 15%, which exceeds the national malnutrition rates in children under five years of age.</font></p> <p align="left"><font size="2" face="Verdana"><b>Keywords</b>: Acute malnutrition, chronic malnutrition, indigenous.</font></p> <hr size="1" noshade> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>INTRODUCCI&Oacute;N</b></font></p> <p align="left"><font size="2" face="Verdana">La desnutrici&oacute;n es una enfermedad multisist&eacute;mica, que afecta todos los &oacute;rganos y sistemas del ser humano, es producida por una disminuci&oacute;n dr&aacute;stica, aguda o cr&oacute;nica, en la disponibilidad de nutrimentos, ya sea por ingesti&oacute;n insuficiente, inadecuada absorci&oacute;n, exceso de p&eacute;rdidas o la conjunci&oacute;n de dos o m&aacute;s de estos factores. Se manifiesta por grados de d&eacute;ficit antropom&eacute;trico, signos y s&iacute;ntomas cl&iacute;nicos y alteraciones bioqu&iacute;micas, hematol&oacute;gicas e inmunol&oacute;gicas (1).</font></p> <p align="left"><font size="2" face="Verdana">La poblaci&oacute;n ind&iacute;gena est&aacute; gravemente afectada por este problema, tal vez por ser un estrato olvidado y descuidado por la poblaci&oacute;n en general y por el estado paraguayo. A pesar de las leyes, y de todos los proyectos que favorecen a esta esfera de la sociedad, a&uacute;n existe un abismo inimaginable entre lo ideal y lo real. Mientras se elaboran programas que buscan dar mejores condiciones de vida a estas comunidades, que la mayor&iacute;a de las veces solo quedan plasmados en el papel, los &iacute;ndices de desnutrici&oacute;n son alarmantes. Esto se debe probablemente a que en la sociedad posmoderna, la deforestaci&oacute;n, el uso de agrot&oacute;xicos, la invasi&oacute;n de los terratenientes despoj&oacute; a los nativos de sus tierras, oblig&aacute;ndolos a vivir en situaciones carenciales, pues estos debido a su cultura esperan que la naturaleza les ofrezca el sustento diario. Las costumbres, la econom&iacute;a y la religi&oacute;n en las etnias Paí Tavyter&atilde; y Mby`a Guaran&iacute; est&aacute;n &iacute;ntimamente relacionadas a la producci&oacute;n alimenticia e ingesta.</font></p> <p align="left"><font size="2" face="Verdana">Para el nativo guaran&iacute; es muy dif&iacute;cil comprender que el hombre es el que debe producir alimento para su sustento, pero como la sociedad actual obliga a ello, estos por no conseguir adaptarse a los cambios que se produjeron, est&aacute;n m&aacute;s expuestos a las carencias alimentarias. Seg&uacute;n datos del gobierno central en el 2008, 41,8% de los ni&ntilde;os ind&iacute;genas menores de 5 a&ntilde;os padec&iacute;an de desnutrici&oacute;n.</font></p> <p align="left"><font size="2" face="Verdana">En un estudio realizado en M&eacute;xico, la prevalencia de desnutrici&oacute;n en ind&iacute;genas fue 39,4%(2). Un 44% present&oacute; uno o m&aacute;s signos cl&iacute;nicos de malnutrici&oacute;n. Seg&uacute;n el Instituto Nacional de Encuestas y Censos del Ecuador (2001 y 2006) 40,1% de los ni&ntilde;os ind&iacute;genas menores de 5 a&ntilde;os tienen desnutrici&oacute;n cr&oacute;nica (3).</font></p> <p align="left"><font size="2" face="Verdana">En Caracas, se hizo un estudio con la poblaci&oacute;n infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, y ellos obtuvieron el siguiente resultado: El diagn&oacute;stico nutricional hallado con mayor frecuencia fue Nutrici&oacute;n normal (55%) seguida por Desnutrici&oacute;n Subcl&iacute;nica (15%) y Desnutrici&oacute;n Leve (12%). 
En l&iacute;neas generales, un 55% de la poblaci&oacute;n se encontraba en rangos de nutrici&oacute;n normal, mientras el 45% restante presentaba problema de malnutrici&oacute;n comprendiendo &eacute;sta por d&eacute;ficit y por exceso (4).</font></p> <p align="left"><font size="2" face="Verdana">En el Brasil en un estudio realizado para determinar el perfil nutricional de los abor&iacute;genes menores de 5 a&ntilde;os de Kaing&aacute;ngen Paran&aacute; vieron que cuando utilizado los criterios propuestos por la OMS, se registr&oacute; una alta prevalencia de d&eacute;ficit Estatura/Edad, con uno en cuatro ni&ntilde;os (24,8%) que presentaba este diagn&oacute;stico. El d&eacute;ficit de Peso/Edad fue diagnosticado en 9,2% de los ni&ntilde;os evaluados. Los &iacute;ndices de peso para la altura diagnosticaron solo tres ni&ntilde;os (2,1%) como desnutridas agudas (5).</font></p> <p align="left"><font size="2" face="Verdana">En otro estudio realizado tambi&eacute;n en el Brasil, esta vez en Amazonia, con ni&ntilde;os de la etnia Suru&iacute; se observ&oacute; que los porcentajes de los ni&ntilde;os con d&eacute;ficit en los &iacute;ndices de estatura para la edad fue 31,4%, peso para la edad 12,4% y peso para la estatura 0% (6).</font></p> <p align="left"><font size="2" face="Verdana">El objetivo del presente estudio es determinar la prevalencia de desnutrici&oacute;n en ni&ntilde;os menores de 5 a&ntilde;os de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y y conocer el comportamiento alimentario de los ni&ntilde;os/as de las comunidades ind&iacute;genas estudiadas.</font></p> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>MATERIALES Y M&Eacute;TODOS</b></font></p> <p align="left"><font size="2" face="Verdana">Estudio transversal, descriptivo realizado en el periodo de enero a abril del a&ntilde;o 2011, donde se identific&oacute; la prevalencia de desnutrici&oacute;n infantil en ni&ntilde;os ind&iacute;genas de las etnias Pa&#297; Tavyter&atilde; y Mby`a Guaran&iacute; en los distritos de Yby-Ya&uacute; y Azote&rsquo;y.</font></p> <p align="left"><font size="2" face="Verdana">El tama&ntilde;o muestral total fue de 370 ni&ntilde;os, determinado a trav&eacute;s de censo realizado por el Centro de Salud de Yby-Ya&uacute; y el Puesto de Salud de Paso Tuya. Para los fines del estudio fueron identificados 349 ni&ntilde;os (94.3%) de ni&ntilde;os reci&eacute;n nacidos a menores de 5 a&ntilde;os en los distritos de Yby-Ya&uacute; y Azote'y.</font></p> <p align="left"><font size="2" face="Verdana">Las etnias que se encuentran dentro del &aacute;rea de estudio est&aacute; compuesta por los mby`a guaran&iacute; y los pa&#297; tavyter&atilde;, distribuidas en las siguientes comunidades ind&iacute;genas: Vy'apav&#7869;, Yrapey, Guyrakeha, Guyra &Ntilde;e'engatuamba, Satí;, San Juan, Mbery'o Jaguarymi, Ka'aguy Poty Rory, Yvyra'ija, Tukambiju y Takuaritiy.</font></p> <p align="left"><font size="2" face="Verdana">El trabajo se realiz&oacute; por concentraci&oacute;n, en los locales fijados por los l&iacute;deres de las distintas comunidades. Fue aplicado un cuestionario a las madres, creado para el efecto por medio de entrevista. La edad de los ni&ntilde;os fue dada por las madres, pues la mayor&iacute;a de estas no cuentan con registro de nacimiento, ni siquiera certificado de nacido vivo.</font></p> <p align="left"><font size="2" face="Verdana">Para la evaluaci&oacute;n del estado nutricional de los ni&ntilde;os se opt&oacute; por la curva del gr&aacute;fico de crecimiento de la Organizaci&oacute;n Mundial de la Salud (OMS) lo cual est&aacute; contenido en la libreta del ni&ntilde;o y la ni&ntilde;a. Los ni&ntilde;os/as fueron pesados/as en balanzas mec&aacute;nicas, los que ya consegu&iacute;an quedarse de pie fueron pesados en balanza de pie y los ni&ntilde;os menores de 1 a&ntilde;o en balanzas colgantes.</font></p> <p align="left"><font size="2" face="Verdana">Para la medida de la altura, los ni&ntilde;os mayores de dos a&ntilde;os fueron colocados en posici&oacute;n de pie, bien rectos, y fueron medidos con el tallimetro. La talla de los ni&ntilde;os menores de 2 a&ntilde;os fue realizada con cinta m&eacute;trica con el ni&ntilde;o/a en dec&uacute;bito supino en superficie recta.</font></p> <p align="left"><font size="2" face="Verdana">Los datos fueron analizados manualmente, y los gr&aacute;ficos confeccionados con el programa Microsoft Office Excel 2007.</font></p> <p align="justify">&nbsp;</p> ]]></body>
+<body><![CDATA[<p align="left"><font size="3" face="Verdana"><b>RESULTADOS</b></font></p> <p align="left"><font size="2" face="Verdana">Se evaluaron 349 ni&ntilde;os, que representan el 94,3% del total de abor&iacute;genes menores de 5 a&ntilde;os de las comunidades de Yby-Ya&uacute; y Azote&rsquo;y. Del total de 349 ni&ntilde;os, 69 % (240) son Paí; Tavyter&atilde; y 31% (109) Mby`a Guaran&iacute;. </font></p> <p align="left"><font size="2" face="Verdana">La comunidad con el mayor porcentaje de ni&ntilde;os fue la de Vy'&atilde;pav&#7869; (36,4%), y la de menor frecuencia fue la comunidad de Tekoha Kag&atilde;t&atilde;, que es una comunidad reci&eacute;n formada localizada en Pasi&ntilde;o (<a href="#2a02f1">Figura 1</a>).</font></p> <p align="center"><a name="2a02f1"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f1.jpg"></p> <p align="left"><font size="2" face="Verdana">Viendo el perfil nutricional de los ni&ntilde;os, se pudo observar que 61% de los ni&ntilde;os/as no est&aacute;n desnutridos, 24% de los ni&ntilde;os/as est&aacute;n en riesgo de desnutrici&oacute;n y 15% est&aacute;n con desnutrici&oacute;n. Aunque se trata de un estrato social desfavorecido tambi&eacute;n se observa &iacute;ndice de sobrepeso y obesidad, en las comunidades de Vy'&atilde;pav&#7869; e Yrapey (<a href="#2a02f2">Figura 2</a>).</font></p> <p align="center"><a name="2a02f2"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f2.jpg"></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">Teniendo presente los gr&aacute;ficos de Talla/Edad la prevalencia de desnutrici&oacute;n cr&oacute;nica es bastante elevada, pues 77% de los ni&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. El mayor &iacute;ndice de desnutrici&oacute;n se encuentran en los primeros 24 meses de vida (<a href="#2a02t1">Tabla 1</a>). De los 53 ni&ntilde;os con desnutrici&oacute;n, 60,4% padecen de desnutrici&oacute;n moderada, y el 39,6% desnutrici&oacute;n grave. Siendo que el mayor porcentaje de desnutrici&oacute;n se observa en Vy'&atilde;pav&#7869;.</font></p> <p align="center"><a name="2a02t1"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02t1.jpg"></p> <p align="left"><font size="2" face="Verdana">Se estudi&oacute; adem&aacute;s el comportamiento alimentario de estos ni&ntilde;os, viendo que alimentos preferencialmente hacen parte de su dieta y la edad de introducci&oacute;n de los mismos, la mayor&iacute;a de las madres introducen alg&uacute;n tipo alimento entre los 6 y 8 meses de edad (<a href="#2a02f3">Figura 3</a>) y los primeros alimentos introducidos dependen del lugar donde estos habitan. El caldo de pescado es uno de los primeros alimentos introducidos en las comunidades que viven cerca de los r&iacute;os, entretanto el 60% inician la alimentaci&oacute;n con caldo de arroz y caldo de fideo.</font></p> <p align="center"><a name="2a02f3"></a></p> <p align="left">&nbsp;</p> <p align="center"><img src="../../../../../img/revistas/ped/v42n2/2a02f3.jpg"></p> <p align="left"><font size="2" face="Verdana">Al observar la frecuencia en que se alimentan estos ni&ntilde;os, el 64% se alimenta tres veces al d&iacute;a, el 20% menos de 3 veces al d&iacute;a y solo el 16 % m&aacute;s de tres veces al d&iacute;a.</font></p> <p align="left"><font size="2" face="Verdana">El principal nutriente en la dieta son los carbohidratos, el 47% de los ni&ntilde;os consumen carbohidratos m&aacute;s de 5 veces por semana, y el 21% menos de 3 veces por semana. El mayor porcentaje de consumo de prote&iacute;nas se observa en las comunidades que se encuentran cerca de r&iacute;os (Guyra &Ntilde;e`engatuamba y Mbery'o Jaguarymi), siendo que 70% consume prote&iacute;nas menos de 3 veces por semana, y solo el 3% m&aacute;s de cinco veces por semana. El consumo de verduras y hortalizas es muy escaso, el 91% consume verduras y hortalizas menos de 3 veces por semana, el 2% m&aacute;s de 5 veces y 7% entre 3 y 5 veces por semana.</font></p> ]]></body>
+<body><![CDATA[<p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>DISCUSI&Oacute;N</b></font></p> <p align="left"><font size="2" face="Verdana">A lo largo de toda la historia de la humanidad, la desnutrici&oacute;n ha sido una patolog&iacute;a de las clases sociales menos privilegiadas, son los que no poseen las condiciones necesarias para tener una vida digna, donde la educaci&oacute;n, salud, recursos econ&oacute;micos son miserables, donde esta dolencia alcanza su auge (7).</b></font></p> <p align="left"><font size="2" face="Verdana">Seg&uacute;n los datos del Censo realizado por la Unidad de Salud Ind&iacute;gena que se encuentra en el Distrito de Yby-Ya&uacute;, los Puestos de Salud de Yby- Ya&uacute; y Azote&rsquo;y en el tercer trimestre del A&ntilde;o 2010, se encontraron 328 ni&ntilde;os de hasta 60 meses (8). Al realizar los trabajos de campo, este n&uacute;mero se elev&oacute; a 349 individuos, por lo que se hizo un nuevo censo solo con los ni&ntilde;os de este grupo etario. Ese fen&oacute;meno tal vez, se deba a la migraciones que se desarrollan normalmente entre los guaran&iacute;. Al observar la historia, y tambi&eacute;n por la experiencia que se adquiri&oacute; durante el trabajo de campo, se pudo observar la familia ling&uuml;&iacute;stica a la cual pertenecen los mby`a y los paí; (la guaran&iacute;) son n&oacute;madas, es com&uacute;n que migren a otras comunidades, en un mismo Tekoha (9,10).</b></font></p> <p align="left"><font size="2" face="Verdana">La poblaci&oacute;n diana fue de 370 ni&ntilde;os menores de 5 a&ntilde;os de los cuales se lleg&oacute; a entrevistar a las madres de 349 y se hizo las mediciones antropom&eacute;tricas posteriormente. En la mayor&iacute;a de las comunidades ind&iacute;genas se obtuvo el 100% de participaci&oacute;n, son excepciones las comunidades de Yrapey y Takuaritiy.</b></font></p> <p align="left"><font size="2" face="Verdana">Del total de ni&ntilde;os/as, la etnia de mayor prevalencia fue la de Paí; Tavyter&atilde;. En relaci&oacute;n al sexo, las comunidades son bastante equilibradas, con una ligera prevalencia del sexo masculino sobre el femenino.</b></font></p> <p align="left"><font size="2" face="Verdana">Seg&uacute;n datos de la UNICEF en Paraguay se observa 3,4% de desnutrici&oacute;n aguda en ni&ntilde;os menores de 5 a&ntilde;os (11). La prevalencia de desnutrici&oacute;n en los ni&ntilde;os paraguayos menores de 5 a&ntilde;os en el &aacute;rea rural es de 5,9% y en el &aacute;rea urbana es de 4,5% (12). Existen pocas publicaciones sobre este tema en abor&iacute;genes menores de 5 a&ntilde;os, siendo que el mayor n&uacute;mero de publicaciones fue realizado por el Brasil (12,4%), M&eacute;xico (39,4%) y Ecuador.</b></font></p> <p align="left"><font size="2" face="Verdana">La prevalencia de desnutrici&oacute;n en las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y es de 15,2%, observando los gr&aacute;ficos de Peso/edad si de 2 a&ntilde;os y Peso/Talla en mayores de 2 a&ntilde;os y menores de 5 a&ntilde;os. Las comunidades donde la desnutrici&oacute;n son m&aacute;s prevalentes son Guyrakeha e Yvyra'ija; en Satí; y Tekoha Kagat&atilde; no se encontr&oacute; ni&ntilde;os desnutridos.</b></font></p> <p align="left"><font size="2" face="Verdana">De 53 ni&ntilde;os con desnutrici&oacute;n, 60,4% padecen de desnutrici&oacute;n moderada, y el 39,6% desnutrici&oacute;n grave. 
El grupo con mayor &iacute;ndice de desnutrici&oacute;n, se encuentra durante los primeros 24 meses, pues es en esta etapa donde el organismo requiere una mayor cantidad de nutrientes por el mayor crecimiento. Adem&aacute;s, despu&eacute;s de los 6 meses se inicia la introducci&oacute;n de otros alimentos. Estos dos factores, asociados aumentan el &iacute;ndice de desnutrici&oacute;n en este grupo de edad.</b></font></p> <p align="left"><font size="2" face="Verdana">De la poblaci&oacute;n total de los ni&ntilde;os estudiados el 23,8% est&aacute;n con riesgo de desnutrici&oacute;n. Seg&uacute;n el Instituto Nacional de Alimentaci&oacute;n y Nutrici&oacute;n (INAN) en el a&ntilde;o 2010, 13,6% de ni&ntilde;os menores de 5 a&ntilde;os del &aacute;rea urbana y 16,2% del &aacute;rea rural del Paraguay sufren desnutrici&oacute;n cr&oacute;nica. En una encuesta realizada por la Direcci&oacute;n General de Estad&iacute;stica, Encuestas y Censos en el a&ntilde;o 2008, 41,8% de los ni&ntilde;os/as ind&iacute;genas menores de cinco a&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. Observadas las medidas de Talla/Edad el 77% de los ni&ntilde;os padecen de desnutrici&oacute;n cr&oacute;nica. Ese dato es alarmante, porque la desnutrici&oacute;n cr&oacute;nica es consecuencia de una carencia prolongada de alimentos o enfermedades sucesivas. En Tukambiju, Mbery'o Jaguarymi, Guyrakeha, Yvyra'ija y Satí; son comunidades con una prevalencia mayor al 80% de ni&ntilde;os/as con talla baja para la edad.</b></font></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">El &iacute;ndice de desnutrici&oacute;n en ind&iacute;genas en los distritos de Yby-Ya&uacute; y Azote&rsquo;y, sobrepasa la prevalencia general de desnutrici&oacute;n en menores de 5 a&ntilde;os del pa&iacute;s, lo cual est&aacute; alrededor de 5.9% seg&uacute;n datos del INAN.</b></font></p> <p align="left"><font size="2" face="Verdana">En las comunidades ind&iacute;genas se puede observar que un porcentaje razonable introduce alimentos entre los 6 meses y antes de los 9 meses. El porcentaje de los que introducen antes de los 6 meses es de 18,6% y entre los 9 meses y un a&ntilde;o es de 27%. Se pudo observar que, ocho ni&ntilde;os tuvieron lactancia materna exclusiva por m&aacute;s de 1 a&ntilde;o. Todos los ni&ntilde;os/as con lactancia materna exclusiva en la fecha de la recolecci&oacute;n de datos ten&iacute;a menos de 6 meses o 6 meses. El caldo de fideo y de arroz ocupa el primer y segundo lugar respectivamente como primer alimento introducido por las madres. Los alimentos que deber&iacute;an ser introducidos inicialmente como el pur&eacute; de frutas y verduras ocupan un peque&ntilde;o porcentaje en la lista. Otros alimentos que se tendr&iacute;an que introducir despu&eacute;s de los 9 meses, de preferencia a los un a&ntilde;o, como por ejemplo el caldo de poroto, caldo de pescado, leche de vaca y huevo son los primeros alimentos que se introducen.</b></font></p> <p align="left"><font size="2" face="Verdana">El 64% de los ni&ntilde;os se alimentan tres veces al d&iacute;a, el 20,5% menos de tres veces y 15,5% m&aacute;s de tres veces al d&iacute;a.</b></font></p> <p align="left"><font size="2" face="Verdana">El 69,5% de los ni&ntilde;os/as de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y consumen prote&iacute;nas menos de tres veces por semana; 27,3% consumen de tres a cinco veces por semana los diferentes tipos de prote&iacute;nas, teniendo predominancia el consumo de pez. Solo 3,2% consume prote&iacute;nas m&aacute;s de 5 veces. Las comunidades que viven cerca de bosques, r&iacute;os o arroyos son los que m&aacute;s consumen prote&iacute;nas.</b></font></p> <p align="left"><font size="2" face="Verdana">Los carbohidratos son la principal fuente de alimentaci&oacute;n de los ni&ntilde;os y ni&ntilde;as de las comunidades ind&iacute;genas de Yby-Ya&uacute; y Azote&rsquo;y. Eso se debe a que son los alimentos de m&aacute;s f&aacute;cil adquisici&oacute;n y los m&aacute;s accesibles econ&oacute;micamente hablando.</b></font></p> <p align="left"><font size="2" face="Verdana">En las comunidades ind&iacute;genas el consumo de verduras y hortalizas es escaso. Las comunidades que m&aacute;s consumen verduras y hortalizas son Mberyo Jaguarymi y Takuaritiy.</b></font></p> <p align="left"><font size="2" face="Verdana">Este trabajo refleja la realidad de las comunidades ind&iacute;genas de los dos distritos observados, no podemos extrapolar estas mismas cifras en el departamento de Concepci&oacute;n, o en todo el pa&iacute;s por el tama&ntilde;o de la muestra, es necesario hacer nuevos estudios con un tama&ntilde;o muestral mayor para obtener una visi&oacute;n del verdadero estado nutricional de los ni&ntilde;os ind&iacute;genas. 
El porcentaje de desnutrici&oacute;n es alto, pero se trata de distritos con no muchos recursos econ&oacute;micos, donde la pobreza es una realidad a&uacute;n en otros estratos sociales.</b></font></p> <p align="left"><font size="2" face="Verdana">La realidad ind&iacute;gena es un problema real, y una manera de reducir estas cifras es ense&ntilde;&aacute;ndoles a producir su propio alimento. Para ello no debemos luchar con su cultura ni intentar hacerlos ver el mundo a trav&eacute;s de nuestra realidad, sino dentro de sus costumbres encontrar formas de que ellos tengan condiciones de un mejor porvenir.</font></p> <p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>AGRADECIMIENTOS</b></font></p> ]]></body>
+<body><![CDATA[<p align="left"><font size="2" face="Verdana">A las comunidades ind&iacute;genas que participaron en nuestro estudio, los profesionales de blanco del Centro de Salud de Yby-Yau y Azote&rsquo;y, a la Comunidad de Hermanas de la Divina Providencia de Yby-Yau, a la Dra. Blanca Villalba y a la Dra. Gloria Mart&iacute;nez.</font></p> <p align="justify">&nbsp;</p> <p align="left"><font size="3" face="Verdana"><b>REFERENCIAS</b></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">1. Monteiro CA. Fome, desnutri&ccedil;&atilde;o e pobreza: al&eacute;m da sem&acirc;ntica. Sa&uacute;de Soc. 2003;12(1):7-11. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102986&pid=S1683-9803201500020000200001&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">2. Vi&ntilde;as MR, Fr&iacute;as ML, Verd&uacute; JM. Entorno social y desnutrici&oacute;n en ni&ntilde;os de 1 a 4 a&ntilde;os de comunidades ind&iacute;genas de M&eacute;xico. Rev Esp Nutr Comunitaria. 2005;11(3):128-34. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102988&pid=S1683-9803201500020000200002&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">3. INEC. Ecuador: 40,1% de ind&iacute;genas con desnutrici&oacute;n cr&oacute;nica. Ecuador: Estudio del INEC; 2009. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102990&pid=S1683-9803201500020000200003&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">4. Chumpitaz D, Russo A, Del NogaL B, Case C, Lares M. Evaluaci&oacute;n nutricional de la poblaci&oacute;n infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, agosto-octubre 2004. AVFT. 2006;25(1):26-31. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102992&pid=S1683-9803201500020000200004&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">5. Kuhl AM, Tittoni C, Leite MS, Bastos JL. Perfil Nutricional e fatores associados &agrave; ocorr&ecirc;ncia de desnutri&ccedil;&atilde;o entre crian&ccedil;as ind&iacute;genas Kaing&aacute;ng da Terra Ind&iacute;gena de Mangueirinha, Paran&aacute;, Brasil. Cad Sa&uacute;de P&uacute;blica. 2009;25(2):409-420. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102994&pid=S1683-9803201500020000200005&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">6. Orellana JD, Coimbra Jr. CE, Louren&ccedil;o AE, Santos RV. Estado nutricional e anemia en crian&ccedil;as Suru&iacute;, Amaz&ocirc;nia, Brasil. J Pediatr (Rio J). 2006;82(5):383-88. 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102996&pid=S1683-9803201500020000200006&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">7. Organizaci&oacute;n de las Naciones Unidas. Foro permanente para las cuestiones ind&iacute;genas: informe sobre el quinto per&iacute;odo de sesiones (15 a 26 de mayo de 2006). Nueva York: Naciones Unidas; 2006. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=102998&pid=S1683-9803201500020000200007&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">8. Centro de Salud de Yby-Yau. Censo local de las comunidades ind&iacute;genas. Yby-Yau; 2010. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103000&pid=S1683-9803201500020000200008&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">9. Chase-Sardi M, Brun A, Enciso MA. Situaci&oacute;n sociocultural, econ&oacute;mica, jur&iacute;dico-pol&iacute;tico actual de las comunidades ind&iacute;genas del Paraguay. Asunci&oacute;n: UCA; 1989. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103002&pid=S1683-9803201500020000200009&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">10. Meliá B, Grunberg G, Grunberg F. Paî -Tavyterã: etnograf&iacute;a guaran&iacute; del Paraguay contempor&aacute;neo. 2da. ed. Asunci&oacute;n: Centro de Estudios Antrop&oacute;logicos de la Universidad Cat&oacute;lica; 2008. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103004&pid=S1683-9803201500020000200010&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">11. FAO. Panorama de la seguridad alimentaria y nutricional en Am&eacute;rica Latina y el Caribe 2013. FAO; 2014. &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103006&pid=S1683-9803201500020000200011&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --> </font></p> <!-- ref --><p align="left"><font size="2" face="Verdana">12. Masi C, S&aacute;nchez Bernal S, Dallman D, Rodas A, Morinigo G, Mendoza L. Perfil nutricional de ni&ntilde;os menores de 5 a&ntilde;os que acuden a servicios p&uacute;blicos de salud en el Paraguay. Asunci&oacute;n: INAN; 2010. 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;[&#160;<a href="javascript:void(0);" onclick="javascript: window.open('/scielo.php?script=sci_nlinks&ref=103008&pid=S1683-9803201500020000200012&lng=','','width=640,height=500,resizable=yes,scrollbars=1,menubar=yes,');">Links</a>&#160;]<!-- end-ref --></font></p> ]]></body><back>
+<ref-list>
+<ref id="B1">
+<label>1</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Monteiro]]></surname>
+<given-names><![CDATA[CA]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Fome, desnutrição e pobreza: além da semântica]]></article-title>
+<source><![CDATA[Saúde Soc]]></source>
+<year>2003</year>
+<volume>12</volume>
+<numero>1</numero>
+<issue>1</issue>
+<page-range>7-11</page-range></nlm-citation>
+</ref>
+<ref id="B2">
+<label>2</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Viñas]]></surname>
+<given-names><![CDATA[MR]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Frías]]></surname>
+<given-names><![CDATA[ML]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Verdú]]></surname>
+<given-names><![CDATA[JM]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="es"><![CDATA[Entorno social y desnutrición en niños de 1 a 4 años de comunidades indígenas de México]]></article-title>
+<source><![CDATA[Rev Esp Nutr Comunitaria]]></source>
+<year>2005</year>
+<volume>11</volume>
+<numero>3</numero>
+<issue>3</issue>
+<page-range>128-34</page-range></nlm-citation>
+</ref>
+<ref id="B3">
+<label>3</label><nlm-citation citation-type="book">
+<collab>INEC</collab>
+<source><![CDATA[Ecuador: 40,1% de indígenas con desnutrición crónica]]></source>
+<year>2009</year>
+<publisher-loc><![CDATA[Ecuador ]]></publisher-loc>
+<publisher-name><![CDATA[Estudio del INEC]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B4">
+<label>4</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Chumpitaz]]></surname>
+<given-names><![CDATA[D]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Russo]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Del NogaL]]></surname>
+<given-names><![CDATA[B]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Case]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Lares]]></surname>
+<given-names><![CDATA[M]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Evaluación nutricional de la población infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, agosto-octubre 2004]]></article-title>
+<source><![CDATA[AVFT]]></source>
+<year>2006</year>
+<volume>25</volume>
+<numero>1</numero>
+<issue>1</issue>
+<page-range>26-31</page-range></nlm-citation>
+</ref>
+<ref id="B5">
+<label>5</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Kuhl]]></surname>
+<given-names><![CDATA[AM]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Tittoni]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Leite]]></surname>
+<given-names><![CDATA[MS]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Bastos]]></surname>
+<given-names><![CDATA[JL]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Perfil Nutricional e fatores associados à ocorrência de desnutrição entre crianças indígenas Kaingáng da Terra Indígena de Mangueirinha, Paraná, Brasil]]></article-title>
+<source><![CDATA[Cad Saúde Pública]]></source>
+<year>2009</year>
+<volume>25</volume>
+<numero>2</numero>
+<issue>2</issue>
+<page-range>409-420</page-range></nlm-citation>
+</ref>
+<ref id="B6">
+<label>6</label><nlm-citation citation-type="journal">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Orellana]]></surname>
+<given-names><![CDATA[JD]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Coimbra Jr]]></surname>
+<given-names><![CDATA[CE]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Lourenço]]></surname>
+<given-names><![CDATA[AE]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Santos]]></surname>
+<given-names><![CDATA[RV]]></given-names>
+</name>
+</person-group>
+<article-title xml:lang="pt"><![CDATA[Estado nutricional e anemia en crianças Suruí, Amazônia, Brasil]]></article-title>
+<source><![CDATA[J Pediatr (Rio J)]]></source>
+<year>2006</year>
+<volume>82</volume>
+<numero>5</numero>
+<issue>5</issue>
+<page-range>383-88</page-range></nlm-citation>
+</ref>
+<ref id="B7">
+<label>7</label><nlm-citation citation-type="book">
+<collab>Organización de las Naciones Unidas</collab>
+<source><![CDATA[Foro permanente para las cuestiones indígenas: informe sobre el quinto período de sesiones (15 a 26 de mayo de 2006)]]></source>
+<year>2006</year>
+<publisher-loc><![CDATA[Nueva York ]]></publisher-loc>
+<publisher-name><![CDATA[Naciones Unidas]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B8">
+<label>8</label><nlm-citation citation-type="">
+<collab>Centro de Salud de Yby-Yau</collab>
+<source><![CDATA[Censo local de las comunidades indígenas]]></source>
+<year>2010</year>
+<publisher-loc><![CDATA[Yby-Yau ]]></publisher-loc>
+</nlm-citation>
+</ref>
+<ref id="B9">
+<label>9</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Chase-Sardi]]></surname>
+<given-names><![CDATA[M]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Brun]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Enciso]]></surname>
+<given-names><![CDATA[MA]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Situación sociocultural, económica, jurídico-político actual de las comunidades indígenas del Paraguay]]></source>
+<year>1989</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[UCA]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B10">
+<label>10</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Meliá]]></surname>
+<given-names><![CDATA[B]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Grunberg]]></surname>
+<given-names><![CDATA[G]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Grunberg]]></surname>
+<given-names><![CDATA[F]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Paî -Tavyterã: etnografía guaraní del Paraguay contemporáneo. 2da. ed]]></source>
+<year>2008</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[Centro de Estudios Antropólogicos de la Universidad Católica]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B11">
+<label>11</label><nlm-citation citation-type="book">
+<collab>FAO</collab>
+<source><![CDATA[Panorama de la seguridad alimentaria y nutricional en América Latina y el Caribe 2013]]></source>
+<year>2014</year>
+<publisher-name><![CDATA[FAO]]></publisher-name>
+</nlm-citation>
+</ref>
+<ref id="B12">
+<label>12</label><nlm-citation citation-type="book">
+<person-group person-group-type="author">
+<name>
+<surname><![CDATA[Masi]]></surname>
+<given-names><![CDATA[C]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Sánchez Bernal]]></surname>
+<given-names><![CDATA[S]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Dallman]]></surname>
+<given-names><![CDATA[D]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Rodas]]></surname>
+<given-names><![CDATA[A]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Morinigo]]></surname>
+<given-names><![CDATA[G]]></given-names>
+</name>
+<name>
+<surname><![CDATA[Mendoza]]></surname>
+<given-names><![CDATA[L]]></given-names>
+</name>
+</person-group>
+<source><![CDATA[Perfil nutricional de niños menores de 5 años que acuden a servicios públicos de salud en el Paraguay]]></source>
+<year>2010</year>
+<publisher-loc><![CDATA[Asunción ]]></publisher-loc>
+<publisher-name><![CDATA[INAN]]></publisher-name>
+</nlm-citation>
+</ref>
+</ref-list>
+</back>
+</article>
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index 35bf62d..3839c99 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -19,42 +19,28 @@
{"name": "J Doe", "given_name": "J", "surname": "Doe"}
],
"journal": {
- "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
- "eissn": null,
- "issn": null,
- "issue": null,
- "publisher": null,
- "volume": null
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678"
},
"date": "2000",
- "doi": null,
"citations": [
{ "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}],
"date": "2001",
"id": "b0",
"index": 0,
- "issue": null,
"journal": "Letters in the Alphabet",
- "publisher": null,
+ "pages": "1-11",
"title": "Everything is Wonderful",
- "url": null,
"volume": "20"},
{ "authors": [],
"date": "2011-03-28",
"id": "b1",
"index": 1,
- "issue": null,
"journal": "The Dictionary",
- "publisher": null,
"title": "All about Facts",
- "url": null,
"volume": "14"}
],
"abstract": "Everything you ever wanted to know about nothing",
"body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
- "acknowledgement": null,
- "annex": null,
- "fatcat_release": null,
"grobid_timestamp": "2018-04-02T00:31+0000",
"grobid_version": "0.5.1-SNAPSHOT",
"language_code": "en"
diff --git a/python/tests/test_common.py b/python/tests/test_common.py
deleted file mode 100644
index 34d50ed..0000000
--- a/python/tests/test_common.py
+++ /dev/null
@@ -1,40 +0,0 @@
-
-from common import *
-
-
-def test_parse_cdx_line():
-
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- correct = {
- 'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
- 'file:mime': "application/pdf",
- 'file:cdx': {
- 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'dt': "20170828233154",
- 'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'offset': 931661233,
- 'c_size': 210251,
- },
- 'f:c': {
- 'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'd': "2017-08-28T23:31:54",
- 'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'o': 931661233,
- 'c': 1,
- }
- }
-
- assert parse_cdx_line(raw) == correct
- assert parse_cdx_line(raw + "\n") == correct
- assert parse_cdx_line(raw + " extra_field") == correct
-
-def test_invalid_cdx():
-
- print("missing warc")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
- assert parse_cdx_line(raw) == None
-
- print("bad datetime")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- assert parse_cdx_line(raw) == None
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 10560cd..dce64bc 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,75 +1,222 @@
+import json
+import struct
import pytest
-import struct
import responses
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient
-
+from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
-with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
+with open("tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml", "rb") as f:
REAL_TEI_XML = f.read()
-@responses.activate
-def test_grobid_503():
- client = GrobidClient(host_url="http://localhost:8070")
+@pytest.fixture
+def grobid_client():
+ client = GrobidClient(
+ host_url="http://dummy-grobid",
+ )
+ return client
+
+
+@responses.activate
+def test_grobid_503(grobid_client):
status = b'{"status": "done broke due to 503"}'
- responses.add(responses.POST,
- 'http://localhost:8070/api/processFulltextDocument', status=503,
- body=status)
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=503,
+ body=status,
+ )
- resp = client.process_fulltext(FAKE_PDF_BYTES)
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
    # grobid gets exactly one POST
assert len(responses.calls) == 1
- assert resp['status_code'] == 503
- assert resp['status'] == "error"
+ assert resp["status_code"] == 503
+ assert resp["status"] == "error"
+
@responses.activate
-@pytest.mark.skip(reason="XXX: need to fix unicode/bytes something something")
-def test_grobid_success():
+def test_grobid_success_iso_8859(grobid_client):
+ """
+ This might have been the old GROBID behavior, with default encoding? Can't really remember.
+ """
- client = GrobidClient(host_url="http://localhost:8070")
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
- responses.add(responses.POST,
- 'http://localhost:8070/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
- resp = client.process_fulltext(FAKE_PDF_BYTES)
+    # grobid gets exactly one POST
+ assert len(responses.calls) == 1
+
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ # print(type(resp['tei_xml']))
+ # print(type(REAL_TEI_XML))
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("ISO-8859-1")
+
+
+@responses.activate
+def test_grobid_success(grobid_client):
+
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
    # grobid gets exactly one POST
assert len(responses.calls) == 1
- assert resp['status_code'] == 200
- assert resp['status'] == "success"
- print(type(resp['tei_xml']))
- print(type(REAL_TEI_XML))
- assert resp['tei_xml'] == REAL_TEI_XML.decode('utf-8')
- #assert resp['tei_xml'].split('\n')[:3] == REAL_TEI_XML.split('\n')[:3]
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("UTF-8")
+
@responses.activate
-def test_grobid_worker_cdx():
+def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
sink = BlackholeSink()
- grobid_client = GrobidClient(host_url="http://localhost:8070")
- wayback_client = WaybackClient()
worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
- responses.add(responses.POST,
- 'http://localhost:8070/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
-
- with open('tests/files/example.cdx', 'r') as cdx_file:
- pusher = CdxLinePusher(worker, cdx_file,
- filter_http_statuses=[200], filter_mimetypes=['application/pdf'])
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
+
+ with open("tests/files/example.cdx", "r") as cdx_file:
+ pusher = CdxLinePusher(
+ worker,
+ cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=["application/pdf"],
+ )
pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
+
+ assert len(responses.calls) == worker.counts["total"]
- assert len(responses.calls) == worker.counts['total']
+@responses.activate
+def test_grobid_refs_978(grobid_client):
+
+ with open("tests/files/crossref_api_work_978-3-030-64953-1_4.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_978-3-030-64953-1_4.tei.xml", "rb") as f:
+ xml_bytes = f.read()
+ assert "\u2013".encode("utf-8") in xml_bytes
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=xml_bytes,
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+    # grobid gets exactly one POST
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 3
+ assert set([r["id"] for r in refs]) == set(["4_CR93", "4_CR193", "4_CR210"])
+
+ # test case of no references
+ crossref_work["message"]["reference"] = []
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+    # test that passing the 'message' object directly also works
+ refs_row = grobid_client.crossref_refs(crossref_work["message"])
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1007/978-3-030-64953-1_4"
+ assert refs_row["source_ts"] == "2021-05-10T22:08:45Z"
+ assert len(refs_row["refs_json"]) == 0
+
+ # grobid gets no additional POST from the above empty queries
+ assert len(responses.calls) == 1
+
+
+@responses.activate
+def test_grobid_refs_s104(grobid_client):
+
+ # test another file
+ with open("tests/files/crossref_api_work_s1047951103000064.json", "r") as f:
+ crossref_work = json.loads(f.read())
+
+ with open("tests/files/grobid_refs_s1047951103000064.tei.xml", "rb") as f:
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processCitationList",
+ status=200,
+ body=f.read(),
+ content_type="application/xml; charset=UTF-8",
+ )
+
+ refs_row = grobid_client.crossref_refs(crossref_work)
+
+ # GROBID gets one more POST
+ assert len(responses.calls) == 1
+
+ assert refs_row["source"] == "crossref"
+ assert refs_row["source_id"] == "10.1017/s1047951103000064"
+ assert refs_row["source_ts"] == "2021-06-10T05:35:02Z"
+ refs = refs_row["refs_json"]
+ assert len(refs) == 24
+ assert set([r["id"] for r in refs]) == set(
+ [
+ "S1047951103000064_ref025",
+ "S1047951103000064_ref013",
+ "S1047951103000064_ref012",
+ "S1047951103000064_ref041",
+ "S1047951103000064_ref002",
+ "S1047951103000064_ref043",
+ "S1047951103000064_ref037",
+ "S1047951103000064_ref035",
+ "S1047951103000064_ref003",
+ "S1047951103000064_ref005",
+ "S1047951103000064_ref017",
+ "S1047951103000064_ref016",
+ "S1047951103000064_ref001",
+ "S1047951103000064_ref039",
+ "S1047951103000064_ref032",
+ "S1047951103000064_ref014",
+ "S1047951103000064_ref008",
+ "S1047951103000064_ref038",
+ "S1047951103000064_ref018",
+ "S1047951103000064_ref027",
+ "S1047951103000064_ref034",
+ "S1047951103000064_ref044",
+ "S1047951103000064_ref006",
+ "S1047951103000064_ref030",
+ ]
+ )
diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py
index 8497b10..b00a88d 100644
--- a/python/tests/test_grobid2json.py
+++ b/python/tests/test_grobid2json.py
@@ -1,22 +1,28 @@
-
-import xml
import json
+import xml
+
import pytest
-from grobid2json import *
+from grobid_tei_xml import parse_document_xml
def test_small_xml():
-
- with open('tests/files/small.xml', 'r') as f:
+ """
+    This used to be a test of grobid2json; now it is a compatibility test for
+ the to_legacy_dict() feature of grobid_tei_xml.
+ """
+
+ with open("tests/files/small.xml", "r") as f:
tei_xml = f.read()
- with open('tests/files/small.json', 'r') as f:
- json_form = json.loads(f.read())
+ with open("tests/files/small.json", "r") as f:
+ json_form = json.loads(f.read())
+
+ tei_doc = parse_document_xml(tei_xml)
+ assert tei_doc.to_legacy_dict() == json_form
- assert teixml2json(tei_xml) == json_form
def test_invalid_xml():
with pytest.raises(xml.etree.ElementTree.ParseError):
- teixml2json("this is not XML")
+ parse_document_xml("this is not XML")
with pytest.raises(ValueError):
- teixml2json("<xml></xml>")
+ parse_document_xml("<xml></xml>")
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
new file mode 100644
index 0000000..043c63d
--- /dev/null
+++ b/python/tests/test_html.py
@@ -0,0 +1,7 @@
+from sandcrawler.html import extract_fulltext_url
+
+
+def test_extract_fulltext_url():
+
+ resp = extract_fulltext_url("asdf", b"asdf")
+ assert resp == {}
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
new file mode 100644
index 0000000..ba4acf1
--- /dev/null
+++ b/python/tests/test_html_ingest.py
@@ -0,0 +1,10 @@
+from sandcrawler.ingest_html import *
+
+
+def test_html_extract_ojs3() -> None:
+
+ with open("tests/files/first_monday_ojs3_fulltext.html", "rb") as f:
+ ojs3_html = f.read()
+
+ fulltext = html_extract_body_teixml(ojs3_html)
+ assert fulltext["status"] == "success"
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
new file mode 100644
index 0000000..69bd211
--- /dev/null
+++ b/python/tests/test_html_metadata.py
@@ -0,0 +1,261 @@
+import datetime
+
+import pytest
+
+from sandcrawler.html_metadata import *
+
+
+def test_html_metadata_plos() -> None:
+
+ with open("tests/files/plos_one_article.html", "r") as f:
+ plos_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(plos_html))
+ assert meta is not None
+ assert (
+ meta.title
+ == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ )
+ assert meta.doi == "10.1371/journal.pone.0213978"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
+ assert meta.contrib_names == [
+ "Yang Li",
+ "Tuanjie Wang",
+ "Lin Wang",
+ "Mingjun Sun",
+ "Zhizhong Cui",
+ "Shuang Chang",
+ "Yongping Wu",
+ "Xiaodong Zhang",
+ "Xiaohui Yu",
+ "Tao Sun",
+ "Peng Zhao",
+ ]
+ assert meta.container_name == "PLOS ONE"
+ assert meta.container_abbrev == "PLOS ONE"
+ # "Apr 22, 2019"
+ assert meta.release_date == datetime.date(year=2019, month=4, day=22)
+ assert meta.first_page == "e0213978"
+ assert meta.issue == "4"
+ assert meta.volume == "14"
+ assert meta.container_issn == "1932-6203"
+ assert meta.publisher == "Public Library of Science"
+ assert (
+ meta.raw_references
+ and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;"
+ in meta.raw_references
+ )
+ assert meta.release_type == "article-journal"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
+
+
+def test_html_metadata_elife() -> None:
+
+ with open("tests/files/elife_article.html", "r") as f:
+ elife_html = f.read()
+
+ meta = html_extract_biblio(
+ "https://elifesciences.org/articles/44753", HTMLParser(elife_html)
+ )
+ assert meta is not None
+ assert meta.title == "Parallel visual circuitry in a basal chordate"
+ assert meta.doi == "10.7554/eLife.44753"
+ assert meta.contrib_names == [
+ "Matthew J Kourakis",
+ "Cezar Borba",
+ "Angela Zhang",
+ "Erin Newman-Smith",
+ "Priscilla Salas",
+ "B Manjunath",
+ "William C Smith",
+ ]
+ assert meta.container_name == "eLife"
+ # 2019-04-18
+ assert meta.release_date == datetime.date(year=2019, month=4, day=18)
+ assert meta.publisher == "eLife Sciences Publications Limited"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+ )
+
+
+def test_html_metadata_peerj() -> None:
+
+ with open("tests/files/peerj_oa_article.html", "r") as f:
+ peerj_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html))
+ assert meta is not None
+ assert (
+ meta.title
+ == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ )
+ assert meta.doi == "10.7717/peerj.4375"
+ assert meta.contrib_names == [
+ "Heather Piwowar",
+ "Jason Priem",
+ "Vincent Larivière",
+ "Juan Pablo Alperin",
+ "Lisa Matthias",
+ "Bree Norlander",
+ "Ashley Farley",
+ "Jevin West",
+ "Stefanie Haustein",
+ ]
+ assert meta.container_name == "PeerJ"
+ # "2018-02-13"
+ assert meta.release_date == datetime.date(year=2018, month=2, day=13)
+ assert meta.xml_fulltext_url and ".xml" in meta.xml_fulltext_url
+
+
+def test_html_metadata_nature() -> None:
+
+ with open("tests/files/nature_article.html", "r") as f:
+ nature_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(nature_html))
+ assert meta is not None
+ assert meta.title == "More than 100 scientific journals have disappeared from the Internet"
+ assert meta.doi == "10.1038/d41586-020-02610-z"
+ assert meta.contrib_names == [
+ "Diana Kwon",
+ ]
+ assert meta.container_name == "Nature"
+ # "2020-09-10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.publisher == "Nature Publishing Group"
+    # note: an error in the Dublin Core metadata in the Nature HTML results in duplication
+ assert (
+ meta.abstract
+ == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+ )
+
+
+def test_html_metadata_ojs3() -> None:
+
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
+ ojs3_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html))
+ assert meta is not None
+ assert meta.title == "Surveillance, stigma & sociotechnical design for HIV"
+ assert meta.doi == "10.5210/fm.v25i10.10274"
+ assert meta.contrib_names == [
+ "Calvin Liang",
+ "Jevan Alexander Hutson",
+ "Os Keyes",
+ ]
+ assert meta.container_name == "First Monday"
+ assert meta.container_abbrev == "1" # NOTE: bad source metadata
+ assert meta.container_issn == "1396-0466"
+ # "2020/09/10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.lang == "en"
+ assert (
+ meta.abstract
+ == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work†for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
+ )
+ assert (
+ meta.html_fulltext_url
+ == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ )
+ assert meta.release_type == "article-journal"
+
+
+def test_html_metadata_dlib() -> None:
+
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
+ dlib_html = f.read()
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html))
+ assert meta is not None
+ assert meta.doi == "10.1045/may2017-vanhyning"
+ # "2017-05-15"
+ assert meta.release_date == datetime.date(year=2017, month=5, day=15)
+
+
+def test_html_metadata_dc_case() -> None:
+ """
+ This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive.
+ """
+
+ snippet = """
+ <html>
+ <head>
+ <meta name="DC.Citation.Issue" content="123"/>
+ </head>
+ <body>Hi.</body>
+ </html>"""
+
+ meta = html_extract_biblio("http://example.org", HTMLParser(snippet))
+ assert meta is not None
+ assert meta.issue == "123"
+
+
+@pytest.fixture
+def adblock() -> Any:
+ return load_adblock_rules()
+
+
+def test_html_resources(adblock) -> None:
+
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
+ dlib_html = f.read()
+
+ resources = html_extract_resources(
+ "http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html",
+ HTMLParser(dlib_html),
+ adblock,
+ )
+
+ assert dict(url="http://www.dlib.org/style/style1.css", type="stylesheet") in resources
+
+    # check that adblock filtering is working
+ for r in resources:
+ assert "/ga.js" not in r["url"]
+
+ with open("tests/files/plos_one_article.html", "r") as f:
+ plos_html = f.read()
+
+ resources = html_extract_resources(
+ "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
+ HTMLParser(plos_html),
+ adblock,
+ )
+
+    # check that the custom adblock rules are working
+ for r in resources:
+ assert "crossmark-cdn.crossref.org" not in r["url"]
+
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
+ monday_html = f.read()
+
+ resources = html_extract_resources(
+ "https://firstmonday.org/blah/",
+ HTMLParser(monday_html),
+ adblock,
+ )
+
+ with open("tests/files/elife_article.html", "r") as f:
+ elife_html = f.read()
+
+ resources = html_extract_resources(
+ "https://elife.org/blah/",
+ HTMLParser(elife_html),
+ adblock,
+ )
+
+ with open("tests/files/nature_article.html", "r") as f:
+ nature_html = f.read()
+
+ resources = html_extract_resources(
+ "https://nature.com/blah/",
+ HTMLParser(nature_html),
+ adblock,
+ )
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
new file mode 100644
index 0000000..e14a452
--- /dev/null
+++ b/python/tests/test_ingest.py
@@ -0,0 +1,264 @@
+import json
+
+import pytest
+import responses
+from test_grobid import REAL_TEI_XML
+from test_savepagenow import *
+from test_wayback import *
+
+from sandcrawler import *
+
+
+@pytest.fixture
+def ingest_worker(wayback_client, spn_client):
+ grobid_client = GrobidClient(
+ host_url="http://dummy-grobid",
+ )
+ worker = IngestFileWorker(
+ wayback_client=wayback_client,
+ spn_client=spn_client,
+ grobid_client=grobid_client,
+ )
+ return worker
+
+
+@pytest.fixture
+def ingest_worker_pdf(wayback_client_pdf, spn_client):
+ grobid_client = GrobidClient(
+ host_url="http://dummy-grobid",
+ )
+ pgrest_client = SandcrawlerPostgrestClient(
+ api_url="http://dummy-postgrest",
+ )
+ worker = IngestFileWorker(
+ wayback_client=wayback_client_pdf,
+ spn_client=spn_client,
+ grobid_client=grobid_client,
+ pgrest_client=pgrest_client,
+ )
+ return worker
+
+
+@responses.activate
+def test_ingest_success(ingest_worker_pdf):
+
+ with open("tests/files/dummy.pdf", "rb") as f:
+ pdf_bytes = f.read()
+
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
+ }
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=pdf_bytes,
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/grobid?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
+ status=200,
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/pdf_meta?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
+ status=200,
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
+
+ resp = ingest_worker_pdf.process(request)
+
+ print(resp)
+ assert resp["hit"] is True
+ assert resp["status"] == "success"
+ assert resp["request"] == request
+ assert resp["terminal"]["terminal_sha1hex"] == resp["file_meta"]["sha1hex"]
+ assert type(resp["terminal"]["terminal_dt"]) == str
+ assert resp["terminal"]["terminal_url"] == TARGET + "/redirect"
+ assert resp["terminal"]["terminal_status_code"]
+ assert type(resp["file_meta"]["size_bytes"]) == int
+ assert resp["file_meta"]["mimetype"] == "application/pdf"
+ assert resp["cdx"]["url"] == TARGET + "/redirect"
+ assert "warc_path" not in resp["cdx"]
+ assert "revisit_cdx" not in resp
+ assert resp["grobid"]["status"] == "success"
+ assert resp["grobid"]["status_code"] == 200
+ assert resp["grobid"]["grobid_version"]
+ assert "fatcat_release" in resp["grobid"]
+ assert "grobid_version" not in resp["grobid"]["metadata"]
+ assert "fatcat_release" not in resp["grobid"]["metadata"]
+ assert "tei_xml" not in resp["grobid"]
+ assert resp["pdf_meta"]["status"] == "success"
+ assert resp["pdf_meta"]["pdf_extra"]["page_count"] == 1
+ assert resp["pdf_meta"].get("text") is None
+
+
+@responses.activate
+def test_ingest_landing(ingest_worker):
+
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
+ }
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=WARC_BODY,
+ )
+
+    # this is for the second time around; we don't want to fetch the same
+    # landing page HTML again and end up in a loop
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body="<html></html>",
+ )
+
+ resp = ingest_worker.process(request)
+
+ print(resp)
+ assert resp["hit"] is False
+ assert resp["status"] == "no-pdf-link"
+ assert resp["request"] == request
+ assert "terminal" in resp
+ assert "file_meta" not in resp
+ assert "cdx" not in resp
+ assert "revisit_cdx" not in resp
+ assert "grobid" not in resp
+
+
+@responses.activate
+def test_ingest_blocklist(ingest_worker):
+
+ ingest_worker.base_url_blocklist = [
+ "://test.fatcat.wiki/",
+ ]
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-url-blocklist"
+ assert resp["request"] == request
+
+
+@responses.activate
+def test_ingest_wall_blocklist(ingest_worker):
+
+ ingest_worker.wall_blocklist = [
+ "://test.fatcat.wiki/",
+ ]
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-wall"
+ assert resp["request"] == request
+
+
+@responses.activate
+def test_ingest_cookie_blocklist(ingest_worker):
+
+ request = {
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/cookieAbsent",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp["hit"] is False
+ assert resp["status"] == "blocked-cookie"
+ assert resp["request"] == request
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
new file mode 100644
index 0000000..9bd8b5f
--- /dev/null
+++ b/python/tests/test_live_wayback.py
@@ -0,0 +1,181 @@
+"""
+This file contains tests to run against "live" wayback services. They default
+to "skip" because you need authentication, and we shouldn't hit these services
+automatically in CI.
+
+Remove the @pytest.mark.skip decorators to run them.
+"""
+
+import pytest
+
+from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata
+
+
+@pytest.fixture
+def cdx_client():
+ client = CdxApiClient()
+ return client
+
+
+@pytest.fixture
+def wayback_client():
+ client = WaybackClient()
+ return client
+
+
+@pytest.fixture
+def spn_client():
+ client = SavePageNowClient()
+ return client
+
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_cdx_fetch(cdx_client):
+
+ # org,plos,journals)/plosone/article?id=10.1371/journal.pone.0093949 20181105121428 https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949 text/html 200 OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV - - 25338 240665973 MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz
+
+ url = "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949"
+ datetime = "20181105121428"
+ resp = cdx_client.fetch(url, datetime)
+
+ assert resp.url == url
+ assert resp.datetime == datetime
+ assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV"
+ assert resp.warc_csize == 25338
+ assert resp.warc_offset == 240665973
+ assert (
+ resp.warc_path
+ == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+ )
+
+ # bogus datetime; shouldn't match
+ with pytest.raises(KeyError):
+ resp = cdx_client.fetch(url, "12345678123456")
+
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_cdx_lookup_best(cdx_client):
+
+ url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
+ resp = cdx_client.lookup_best(url, best_mimetype="application/pdf")
+
+ # won't know datetime, hash, etc
+ assert resp.url in (url, url.replace("https://", "http://"))
+ assert resp.mimetype == "application/pdf"
+ assert resp.status_code == 200
+
+ url = "https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.gu33570g87v71007"
+ resp = cdx_client.lookup_best(url, best_mimetype="application/pdf")
+
+ assert resp.url in (url, url.replace("https://", "http://"))
+ assert resp.mimetype == "text/html"
+ assert resp.status_code == 200
+
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_wayback_fetch(wayback_client):
+
+ resp = wayback_client.fetch_petabox(
+ 25683,
+ 2676464871,
+ "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz",
+ )
+
+ assert resp.body
+
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_lookup_resource_success(wayback_client):
+
+ url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
+ resp = wayback_client.lookup_resource(url)
+
+ assert resp.hit is True
+ assert resp.status == "success"
+ assert resp.terminal_url in (url, url.replace("https://", "http://"))
+ assert resp.cdx.url in (url, url.replace("https://", "http://"))
+
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_cdx_fetch_spn2(cdx_client):
+
+ # https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 20200110210133
+
+ # com,elsevier,linkinghub)/retrieve/pii/s2590109519300424 20191201203206 https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 text/html 200 FPXVUJR7RXVGO6RIY5HYB6JVT7OD53SG - - 5026 364192270 liveweb-20191201204645/live-20191201195942-wwwb-app52.us.archive.org.warc.gz
+ # com,elsevier,linkinghub)/retrieve/pii/s2590109519300424 20200110210044 https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 text/html 200 OIQ3TKPBQLYYXQDIG7D2ZOK7IJEUEAQ7 - - 5130 710652442 liveweb-20200110204521-wwwb-spn20.us.archive.org-8001.warc.gz
+ # com,elsevier,linkinghub)/retrieve/pii/s2590109519300424 20200110210133 https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 text/html 200 G2MSFAYELECMFGKTYEHUN66WWNW4HXKQ - - 5126 544508422 liveweb-20200110205247-wwwb-spn01.us.archive.org-8000.warc.gz
+
+ url = "https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424"
+ datetime = "20200110210133"
+ resp = cdx_client.fetch(url, datetime, filter_status_code=200)
+
+ assert resp.url == url
+ assert resp.datetime == datetime
+ assert resp.sha1b32 == "G2MSFAYELECMFGKTYEHUN66WWNW4HXKQ"
+ assert resp.status_code == 200
+
+ # https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410
+
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
+
+ url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209"
+ datetime = "20200110222410"
+ resp = cdx_client.fetch(url, datetime, filter_status_code=200)
+
+ assert resp.url == url
+ assert resp.datetime == datetime
+ assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL"
+ assert resp.status_code == 200
+
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_lookup_ftp(wayback_client):
+ # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf
+ # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf
+ # ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf
+
+ # revisit!
+ url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
+ resp = wayback_client.lookup_resource(url)
+
+ assert resp.hit is True
+ assert resp.status == "success"
+ assert resp.terminal_url == url
+ assert resp.terminal_status_code in (226, 200)
+ assert resp.cdx.url == url
+ assert resp.revisit_cdx
+ assert resp.revisit_cdx.url != url
+
+ file_meta = gen_file_metadata(resp.body)
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
+
+ # not revisit?
+ url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
+ resp = wayback_client.lookup_resource(url)
+
+ assert resp.hit is True
+ assert resp.status == "success"
+ assert resp.terminal_url == url
+ assert resp.terminal_status_code in (226, 200)
+ assert resp.cdx.url == url
+
+ file_meta = gen_file_metadata(resp.body)
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
+
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_crawl_ftp(spn_client, wayback_client):
+
+ url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
+ resp = spn_client.crawl_resource(url, wayback_client)
+
+ # FTP isn't supported yet!
+ # assert resp.hit is True
+ # assert resp.status == "success"
+ # assert resp.terminal_url == url
+ # assert resp.cdx.url == url
+
+ assert resp.hit is False
+ assert resp.status == "spn2-no-ftp"
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 420bc07..2bad851 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,71 +1,110 @@
-
import pytest
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line
+from sandcrawler import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_line,
+)
+
def test_gen_file_metadata():
-
+
# valid (but very small) PDF file
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
file_meta = gen_file_metadata(f.read())
assert file_meta == {
- 'mimetype': 'application/pdf',
- 'md5hex': '2942bfabb3d05332b66eb128e0842cff',
- 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
- 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
- 'size_bytes': 13264,
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
}
# valid HTML
fm = gen_file_metadata(
- b"""<html><head><title>dummy</title></head><body>html document</body></html>""")
- assert fm['mimetype'] == 'text/html'
+ b"""<html><head><title>dummy</title></head><body>html document</body></html>"""
+ )
+ assert fm["mimetype"] == "text/html"
# bogus text
fm = gen_file_metadata(b"asdf1234")
- assert fm['mimetype'] == 'text/plain'
- assert fm['size_bytes'] == 8
+ assert fm["mimetype"] == "text/plain"
+ assert fm["size_bytes"] == 8
+
+
+def test_gen_file_metadata_path():
+
+ # valid (but very small) PDF file
+ file_meta = gen_file_metadata_path("tests/files/dummy.pdf")
+ assert file_meta == {
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
+ }
+
def test_b32_hex():
# valid b32
- assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
- assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+ assert (
+ b32_hex("sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
+ assert (
+ b32_hex("TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
# sha1hex pass-through
- s = 'bda3c1017d52e826bbd1da51efad877272d300f9'
+ s = "bda3c1017d52e826bbd1da51efad877272d300f9"
assert b32_hex(s) == s
# invalid
with pytest.raises(ValueError):
- assert b32_hex('blah') == 'blah'
+ assert b32_hex("blah") == "blah"
+
def test_parse_cdx_line():
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
correct = {
- 'sha1b32': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
- 'sha1hex': "b2f65203da9929c2f758e8dd587b5524f904dbe6",
- 'mimetype': "application/pdf",
- 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'datetime': "20170828233154",
- 'warc_path': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'warc_offset': 931661233,
- 'warc_csize': 210251,
- 'http_status': 200,
+ "sha1b32": "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+ "sha1hex": "b2f65203da9929c2f758e8dd587b5524f904dbe6",
+ "mimetype": "application/pdf",
+ "surt": "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "url": "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "datetime": "20170828233154",
+ "warc_path": "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "warc_offset": 931661233,
+ "warc_csize": 210251,
+ "http_status": 200,
}
assert parse_cdx_line(raw) == correct
assert parse_cdx_line(raw + "\n") == correct
assert parse_cdx_line(raw + " extra_field") == correct
+
def test_invalid_cdx():
print("missing warc")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
- assert parse_cdx_line(raw) == None
+ assert parse_cdx_line(raw) is None
print("bad datetime")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- assert parse_cdx_line(raw) == None
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ assert parse_cdx_line(raw) is None
+
+
+def test_clean_url():
+ assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
+ assert (
+ clean_url(
+ "https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
+ == "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
new file mode 100644
index 0000000..9d75655
--- /dev/null
+++ b/python/tests/test_pdfextract.py
@@ -0,0 +1,71 @@
+import struct
+
+import poppler
+import pytest
+from test_wayback import cdx_client, wayback_client # noqa:F401
+
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker
+from sandcrawler.pdfextract import process_pdf
+
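+# Bytes that start with the "%PDF" magic prefix but are not a parseable PDF;
+# struct.pack("!q", ...) just appends eight arbitrary big-endian bytes of junk.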
+FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+
+
+def test_process_fake_pdf():
+ resp = process_pdf(FAKE_PDF_BYTES)
+ print(resp)
+ assert resp.status == "not-pdf"
+
+ with open("tests/files/dummy_zip.zip", "rb") as f:
+ pdf_bytes = f.read()
+ resp = process_pdf(pdf_bytes)
+ assert resp.status == "not-pdf"
+
+
+@pytest.mark.skipif(
+ poppler.version_string() == "0.71.0", reason="unsupported version of poppler"
+)
+def test_process_dummy_pdf():
+ with open("tests/files/dummy.pdf", "rb") as f:
+ pdf_bytes = f.read()
+ resp = process_pdf(pdf_bytes)
+ assert resp.status == "success"
+ assert resp.page0_thumbnail is not None
+ assert len(resp.text) > 10
+ assert resp.meta_xml is None
+ assert resp.file_meta["mimetype"] == "application/pdf"
+ print(resp.pdf_info)
+ print(resp.pdf_extra)
+ assert resp.pdf_info["Author"] == "Evangelos Vlachogiannis"
+ # 595 x 842
+ assert resp.pdf_extra["page0_height"] == 842
+ assert resp.pdf_extra["page0_width"] == 595
+ assert resp.pdf_extra["page_count"] == 1
+
+
+def test_pdfextract_worker_cdx(wayback_client): # noqa: F811
+
+ sink = BlackholeSink()
+ worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
+
+ with open("tests/files/example.cdx", "r") as cdx_file:
+ pusher = CdxLinePusher(
+ worker,
+ cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=["application/pdf"],
+ )
+ pusher_counts = pusher.run()
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
+
+
+def test_pdfextract_blob_worker():
+
+ sink = BlackholeSink()
+ worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
+
+ with open("tests/files/dummy.pdf", "rb") as f:
+ pdf_bytes = f.read()
+
+ worker.process(pdf_bytes)
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index ed9c0bb..ed17d24 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,7 +1,4 @@
-
-import pytest
-
-from sandcrawler.workers import CdxLinePusher, BlackholeSink
+from sandcrawler.workers import BlackholeSink, CdxLinePusher
def test_cdx_line_pusher():
@@ -9,20 +6,24 @@ def test_cdx_line_pusher():
sink = BlackholeSink()
# vanilla (only default filters)
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(sink, cdx_file)
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['pushed'] == 19
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["pushed"] == 19
# HTTP 200 and application/pdf
- with open('tests/files/example.cdx', 'r') as cdx_file:
- pusher = CdxLinePusher(sink, cdx_file,
- filter_mimetypes=['application/pdf'], filter_http_statuses=[200])
+ with open("tests/files/example.cdx", "r") as cdx_file:
+ pusher = CdxLinePusher(
+ sink,
+ cdx_file,
+ filter_mimetypes=["application/pdf"],
+ filter_http_statuses=[200, 226],
+ )
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['skip-http_status'] == 10
- assert counts['skip-mimetype'] == 2
- assert counts['pushed'] == 7
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["skip-http_status"] == 10
+ assert counts["skip-mimetype"] == 2
+ assert counts["pushed"] == 7
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
new file mode 100644
index 0000000..add2c60
--- /dev/null
+++ b/python/tests/test_savepagenow.py
@@ -0,0 +1,331 @@
+import json
+
+import pytest
+import responses
+from test_wayback import *
+
+from sandcrawler import CdxPartial, SavePageNowBackoffError, SavePageNowClient, SavePageNowError
+
+TARGET = "http://dummy-target.dummy"
+JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8"
+PENDING_BODY = {
+ "status": "pending",
+ "job_id": JOB_ID,
+ "resources": [
+ "https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js",
+ "https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js",
+ "https://cdn.onesignal.com/sdks/OneSignalSDK.js",
+ ],
+}
+SUCCESS_BODY = {
+ "status": "success",
+ "job_id": JOB_ID,
+ "original_url": TARGET + "/redirect",
+ "screenshot": "http://web.archive.org/screenshot/http://brewster.kahle.org/",
+ "timestamp": "20180326070330",
+ "duration_sec": 6.203,
+ "resources": [
+ TARGET,
+ TARGET + "/redirect",
+ "http://brewster.kahle.org/",
+ "http://brewster.kahle.org/favicon.ico",
+ "http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg",
+ "http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg",
+ "http://brewster.kahle.org/files/2017/01/computer-1294045_960_720-300x300.png",
+ "http://brewster.kahle.org/files/2017/11/20thcenturytimemachineimages_0000.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6041-1-300x225.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6061-768x1024.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6103-300x225.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6132-225x300.jpg",
+ "http://brewster.kahle.org/files/2018/02/IMG_6138-1-300x225.jpg",
+ "http://brewster.kahle.org/wp-content/themes/twentyten/images/wordpress.png",
+ "http://brewster.kahle.org/wp-content/themes/twentyten/style.css",
+ "http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4",
+ "http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
+ "http://platform.twitter.com/widgets.js",
+ "https://archive-it.org/piwik.js",
+ "https://platform.twitter.com/jot.html",
+ "https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js",
+ "https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html",
+ "https://syndication.twitter.com/settings",
+ "https://www.syndikat.org/en/joint_venture/embed/",
+ "https://www.syndikat.org/wp-admin/images/w-logo-blue.png",
+ "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamAdmin.css?ver=1.0",
+ "https://www.syndikat.org/wp-content/plugins/user-access-manager/css/uamLoginForm.css?ver=1.0",
+ "https://www.syndikat.org/wp-content/plugins/user-access-manager/js/functions.js?ver=4.9.4",
+ "https://www.syndikat.org/wp-content/plugins/wysija-newsletters/css/validationEngine.jquery.css?ver=2.8.1",
+ "https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png",
+ "https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1",
+ "https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4",
+ "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
+ ],
+ "outlinks": {
+ "https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695",
+ "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695",
+ },
+}
+ERROR_BODY = {
+ "status": "error",
+ "exception": "[Errno -2] Name or service not known",
+ "status_ext": "error:invalid-host-resolution",
+ "job_id": JOB_ID,
+ "message": "Couldn't resolve host for http://example5123.com.",
+ "resources": [],
+}
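+# CDX API response (header row plus a single hit) for the capture produced by
+# the SPN job above.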
+CDX_SPN_HIT = [
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180326070330",
+ TARGET + "/redirect",
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz",
+ ],
+]
+
+
+@pytest.fixture
+def spn_client():
+ client = SavePageNowClient(
+ v2endpoint="http://dummy-spnv2/save",
+ ia_access_key="dummy-access-key",
+ ia_secret_key="dummy-secret-key",
+ )
+ client.poll_seconds = 0.0
+ return client
+
+
+@responses.activate
+def test_savepagenow_success(spn_client):
+
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY),
+ )
+
+ resp = spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 5
+
+ assert resp.success is True
+ assert resp.status == "success"
+ assert resp.request_url == TARGET
+ assert resp.terminal_url == TARGET + "/redirect"
+ assert resp.terminal_dt == SUCCESS_BODY["timestamp"]
+ assert resp.resources == SUCCESS_BODY["resources"]
+
+
+@responses.activate
+def test_savepagenow_remote_error(spn_client):
+
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(ERROR_BODY),
+ )
+
+ resp = spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 4
+
+ assert resp.success is False
+ assert resp.status == ERROR_BODY["status_ext"]
+ assert resp.request_url == TARGET
+ assert resp.terminal_url is None
+ assert resp.terminal_dt is None
+ assert resp.resources is None
+
+
+@responses.activate
+def test_savepagenow_500(spn_client):
+
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=500,
+ body=json.dumps(ERROR_BODY),
+ )
+
+ with pytest.raises(SavePageNowError):
+ spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 3
+
+
+@responses.activate
+def test_savepagenow_no_slots(spn_client):
+
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 0,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+
+ with pytest.raises(SavePageNowBackoffError):
+ spn_client.save_url_now_v2(TARGET)
+
+ assert len(responses.calls) == 1
+
+
+@responses.activate
+def test_crawl_resource(spn_client, wayback_client):
+
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/user",
+ status=200,
+ body=json.dumps(
+ {
+ "available": 23,
+ "daily_captures": 60295,
+ "daily_captures_limit": 300000,
+ "processing": 1,
+ }
+ ),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=WARC_BODY,
+ )
+
+ print("https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"))
+ resp = spn_client.crawl_resource(TARGET, wayback_client)
+
+ assert len(responses.calls) == 6
+
+ assert resp.hit is True
+ assert resp.status == "success"
+ assert resp.body == WARC_BODY
+ assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32
+
+ assert type(resp.cdx) == CdxPartial
+ with pytest.raises(AttributeError):
+ print(resp.cdx.warc_path)
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
new file mode 100644
index 0000000..da4dfd8
--- /dev/null
+++ b/python/tests/test_wayback.py
@@ -0,0 +1,297 @@
+import json
+
+import pytest
+import responses
+
+from sandcrawler import CdxApiClient, WaybackClient
+
+CDX_TARGET = "http://fatcat.wiki/"
+CDX_DT = "20180812220054"
+# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
+CDX_SINGLE_HIT = [
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+]
+
+CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
+# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
+CDX_MULTI_HIT = [
+ [
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
+ ],
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # sooner, but not right mimetype
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # sooner and mimetype, but wrong status code
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "400",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "500",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ [
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "150",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # "best"
+ [
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+ # older
+ [
+ "wiki,fatcat)/",
+ "20180712220054",
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
+ ],
+]
+
+
+@pytest.fixture
+def cdx_client():
+ client = CdxApiClient(
+ host_url="http://dummy-cdx/cdx",
+ cdx_auth_token="dummy-token",
+ )
+ return client
+
+
+@responses.activate
+def test_cdx_fetch(cdx_client):
+
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
+
+ resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
+
+ assert len(responses.calls) == 1
+
+ assert resp.datetime == CDX_DT
+ assert resp.url == CDX_TARGET
+ assert resp.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR"
+ assert resp.warc_csize == 8445
+ assert resp.warc_offset == 108062304
+ assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
+
+@responses.activate
+def test_cdx_fetch_errors(cdx_client):
+
+ with pytest.raises(ValueError):
+ resp = cdx_client.fetch(CDX_TARGET, "2019")
+
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
+
+ with pytest.raises(KeyError):
+ resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
+
+ with pytest.raises(KeyError):
+ resp = cdx_client.fetch("http://some-other.com", CDX_DT)
+
+ resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
+ assert len(responses.calls) == 3
+ assert resp
+
+
+@responses.activate
+def test_cdx_lookup_best(cdx_client):
+
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
+
+ resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
+
+ assert len(responses.calls) == 1
+
+ assert resp.datetime == CDX_DT
+ assert resp.url == CDX_TARGET
+ assert resp.sha1b32 == CDX_BEST_SHA1B32
+ assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+
+
+WARC_TARGET = "http://fatcat.wiki/"
+WARC_BODY = b"""
+<html>
+ <head>
+ <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
+ </head>
+ <body>
+ <h1>my big article here</h1>
+ blah
+ </body>
+</html>
+"""
+
+
+@pytest.fixture
+def wayback_client(cdx_client, mocker):
+ client = WaybackClient(
+ cdx_client=cdx_client,
+ petabox_webdata_secret="dummy-petabox-secret",
+ )
+    # mock out the wayback resource store so no live requests are made
+ client.rstore = mocker.Mock()
+ resource = mocker.Mock()
+ client.rstore.load_resource = mocker.MagicMock(return_value=resource)
+ resource.get_status = mocker.MagicMock(return_value=(200, "Ok"))
+ resource.is_revisit = mocker.MagicMock(return_value=False)
+ resource.get_location = mocker.MagicMock(return_value=WARC_TARGET)
+ body = mocker.Mock()
+ resource.open_raw_content = mocker.MagicMock(return_value=body)
+ body.read = mocker.MagicMock(return_value=WARC_BODY)
+
+ return client
+
+
+@pytest.fixture
+def wayback_client_pdf(cdx_client, mocker):
+
+ with open("tests/files/dummy.pdf", "rb") as f:
+ pdf_bytes = f.read()
+
+ client = WaybackClient(
+ cdx_client=cdx_client,
+ petabox_webdata_secret="dummy-petabox-secret",
+ )
+    # mock out the wayback resource store so no live requests are made
+ client.rstore = mocker.Mock()
+ resource = mocker.Mock()
+ client.rstore.load_resource = mocker.MagicMock(return_value=resource)
+ resource.get_status = mocker.MagicMock(return_value=(200, "Ok"))
+ resource.is_revisit = mocker.MagicMock(return_value=False)
+ resource.get_location = mocker.MagicMock(return_value=WARC_TARGET)
+ body = mocker.Mock()
+ resource.open_raw_content = mocker.MagicMock(return_value=body)
+ body.read = mocker.MagicMock(return_value=pdf_bytes)
+
+ return client
+
+
+@responses.activate
+def test_wayback_fetch(wayback_client):
+ resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
+ assert resp.body == WARC_BODY
+ assert resp.location == WARC_TARGET
+
+ resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
+ assert resp == WARC_BODY
+
+
+@responses.activate
+def test_lookup_resource_success(wayback_client):
+
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
+
+ resp = wayback_client.lookup_resource(CDX_TARGET)
+
+ assert resp.hit is True
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
new file mode 100644
index 0000000..786f863
--- /dev/null
+++ b/python/tests/test_xml.py
@@ -0,0 +1,17 @@
+import pytest
+
+from sandcrawler.xml import xml_reserialize
+
+
+def test_xml_reserialize() -> None:
+
+ with open("tests/files/scielo_article.jats.xml", "rb") as f:
+ raw_xml = f.read()
+
+ assert b'encoding="ISO-8859-1"' in raw_xml
+ raw_xml.decode("ISO-8859-1")
+ with pytest.raises(UnicodeDecodeError):
+ raw_xml.decode("utf-8")
+
+ str_xml = xml_reserialize(raw_xml)
+ assert 'encoding="UTF-8"' in str_xml
diff --git a/python/title_slug_blacklist.txt b/python/title_slug_denylist.txt
index 5bca386..5bca386 120000
--- a/python/title_slug_blacklist.txt
+++ b/python/title_slug_denylist.txt
diff --git a/python_hadoop/README.md b/python_hadoop/README.md
index 198c949..7866480 100644
--- a/python_hadoop/README.md
+++ b/python_hadoop/README.md
@@ -68,7 +68,7 @@ running on a devbox and GROBID running on a dedicated machine:
./extraction_cdx_grobid.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
tests/files/example.cdx
@@ -76,7 +76,7 @@ Running from the cluster (once a ./venv-current.tar.gz tarball exists):
./extraction_cdx_grobid.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
-r hadoop \
-c mrjob.conf \
@@ -90,13 +90,13 @@ running on a devbox:
./backfill_hbase_from_cdx.py \
--hbase-table wbgrp-journal-extract-0-qa \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
tests/files/example.cdx
Running from the cluster (once a ./venv-current.tar.gz tarball exists):
./backfill_hbase_from_cdx.py \
- --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-host wbgrp-svc350.us.archive.org \
--hbase-table wbgrp-journal-extract-0-qa \
-r hadoop \
-c mrjob.conf \
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
new file mode 100644
index 0000000..963fb10
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -0,0 +1,187 @@
+package sandcrawler
+
+import java.util.Properties
+
+import scala.util.Try
+import scala.util.matching.Regex
+import scala.util.parsing.json.JSONObject
+
+import cascading.pipe.joiner._
+import cascading.property.AppProps
+import cascading.tap.SinkMode
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.base.JobBase
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
+
+// Type that represents a raw parsed CDX line
+case class CdxLine(surt: String, datetime: String, url: String, mime: String, httpStatus: String, sha1: String, c_size: String, offset: String, warc: String)
+
+/**
+ * CDX backfill:
+ * 1. parse CDX (all columns)
+ * 2. filter CDX (pdf, HTTP 200, etc)
+ * 3. source HBase (key column only)
+ * 4. left join CDX to HBase
+ * 5. filter to only those with null HBase key column
+ * 6. convert CDX fields to HBase columns
+ * 7. sink results to HBase
+ */
+class CdxBackfillJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+
+ import CdxBackfillJob._
+
+ val hbaseSource = getHBaseSource(args("hbase-table"), args("zookeeper-hosts"))
+ val hbaseSink = getHBaseSink(args("hbase-table"), args("zookeeper-hosts"))
+
+ // Parse CDX lines from text file to typed pipe
+ val lines : TypedPipe[String] = TypedPipe.from(TextLine(args("cdx-input-path")))
+
+ val cdxLines : TypedPipe[CdxLine] = lines
+ .filter { isCdxLine }
+ .map { lineToCdxLine }
+ .filter { CdxBackfillJob.keepCdx(_) }
+
+ // (key, f:c, file:cdx, file:mime)
+ val cdxRows : TypedPipe[(String, String, String, String)] = cdxLines
+ .map { CdxBackfillJob.cdxLineToRow }
+ .debug
+
+ val existingKeys : TypedPipe[String] = hbaseSource
+ .read
+ .fromBytesWritable( new Fields("key") )
+ .toTypedPipe[String]('key)
+ //.debug
+
+ // filters out all the lines that have an existing SHA1 key in HBase
+ // the groupBy statements are to select key values to join on.
+ // (key, f:c, file:cdx, file:mime)
+ val newRows : TypedPipe[(String, String, String, String)] = existingKeys
+ .groupBy( identity )
+ .rightJoin(cdxRows.groupBy(_._1))
+ .toTypedPipe
+ .collect { case (_, (None, row)) => row }
+ .debug
+
+ // convert to tuple form and write out into HBase
+ newRows
+ .toPipe('key, 'c, 'cdx, 'mime)
+ .toBytesWritable( new Fields("key", "c", "cdx", "mime") )
+ .write(hbaseSink)
+
+}
+
+object CdxBackfillJob {
+
+ def getHBaseSource(hbase_table: String, zookeeper_hosts: String) : HBaseSource = {
+ HBaseBuilder.build(
+ hbase_table,
+ zookeeper_hosts,
+ List("file:size"), // not actually needed
+ SourceMode.SCAN_ALL)
+ }
+
+ def getHBaseSink(hbase_table: String, zookeeper_hosts: String) : HBaseSource = {
+ HBaseBuilder.buildSink(
+ hbase_table,
+ zookeeper_hosts,
+ List("f:c", "file:cdx", "file:mime"),
+ SinkMode.UPDATE)
+ }
+
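+  // Map common PDF/HTML/XML mimetype variants to a canonical form; unknown
+  // types are simply lowercased (e.g. "Application/PDF; v=1" -> "application/pdf")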
+ def normalizeMime(raw: String) : String = {
+
+ val normalMime = Map(
+ "application/pdf" -> "application/pdf",
+ "application/x-pdf" -> "application/pdf",
+ "('application/pdf'" -> "application/pdf",
+ "image/pdf" -> "application/pdf",
+ "text/pdf" -> "application/pdf",
+ "\"application/pdf\"" -> "application/pdf",
+ "application/postscript" -> "application/postscript",
+ "text/html" -> "text/html",
+ "text/xml" -> "text/xml",
+ "application/xml" -> "text/xml"
+ )
+
+ val lower = raw.toLowerCase()
+ normalMime.find { case (key, _) =>
+ lower.startsWith(key)
+ } match {
+ case Some((_, value)) => value
+ case None => lower
+ }
+ }
+
+ def isCdxLine(line: String) : Boolean = {
+    // reject comments, malformed lines, and lines that are not 11-column CDX
+ !(line.startsWith("#") || line.startsWith(" ") || line.startsWith("filedesc") ||
+ line.split(" ").size != 11)
+ }
+
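+  // Keep only complete rows: HTTP 200, application/pdf, a valid base32 SHA-1,
+  // and numeric size/offset/datetime fields.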
+ def keepCdx(line: CdxLine) : Boolean = {
+ val sha1Pattern = """[A-Z2-7]{32}""".r
+ if (List(line.surt, line.datetime, line.url, line.mime, line.c_size, line.offset, line.warc).contains("-")) {
+ false
+ } else if (line.httpStatus != "200") {
+ false
+ } else if (line.mime != "application/pdf") {
+ false
+ } else if (sha1Pattern.unapplySeq(line.sha1).isEmpty) {
+ false
+ } else if (List(line.c_size, line.offset, line.datetime).map(s => Try(s.toLong).toOption).contains(None)) {
+ false
+ } else {
+ true
+ }
+ }
+
+ // Returns (key, f:c, file:cdx, file:mime), all as strings, which is close to
+ // how they will be inserted into HBase
+ def cdxLineToRow(line: CdxLine) : (String, String, String, String) = {
+
+ val key = "sha1:" + line.sha1
+
+ val warcFile = line.warc.split('/')(1)
+
+    // Read CDX-style datetime and convert to ISO 8601 with second resolution
+ val dtFormat = new java.text.SimpleDateFormat("yyyyMMddHHmmss")
+ val isoFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'")
+ // TODO: timezones? UTC to UTC, so I don't think so.
+ val dtIso = isoFormat.format(dtFormat.parse(line.datetime))
+
+ // This is the "f:c" field. 'i' intentionally not set
+ // python: f:c = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
+ // python: warc_file = warc.split('/')[-1]
+ // python: dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
+ val heritrixInfo = JSONObject(Map(
+ "u" -> line.url,
+ "d" -> dtIso,
+ "f" -> warcFile,
+ "o" -> line.offset.toInt,
+ "c" -> line.c_size.toInt
+ ))
+
+ // python: dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
+ // offset=int(offset), warc=warc)
+ val fileCdx = JSONObject(Map(
+ "surt" -> line.surt,
+ "dt" -> line.datetime,
+ "url" -> line.url,
+ "c_size" -> line.c_size.toInt,
+ "offset" -> line.offset.toInt,
+ "warc" -> line.warc
+ ))
+ (key, heritrixInfo.toString(), fileCdx.toString(), normalizeMime(line.mime))
+ }
+
+ def lineToCdxLine(line: String) : CdxLine = {
+ val raw = line.split("\\s+")
+ // surt, datetime, url, mime, http_status, sha1, SKIP, SKIP, c_size, offset, warc
+ CdxLine(raw(0), raw(1), raw(2), raw(3), raw(4), raw(5), raw(8), raw(9), raw(10))
+ }
+
+}
diff --git a/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
new file mode 100644
index 0000000..c092f7f
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/CdxBackfillJob.scala
@@ -0,0 +1,175 @@
+
+package sandcrawler
+
+import org.scalatest._
+import cascading.tuple.{Tuple, Fields}
+import com.twitter.scalding.{JobTest, Tsv, TypedTsv, TupleConversions, TextLine}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.junit.runner.RunWith
+import org.scalatest.FunSpec
+import org.scalatest.junit.JUnitRunner
+import org.slf4j.LoggerFactory
+import parallelai.spyglass.hbase.HBaseSource
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import scala.util.parsing.json.JSON
+
+class CdxBackfillTest extends FlatSpec with Matchers {
+
+ import CdxBackfillJob._
+
+ it should "normalize mimetypes" in {
+ assert(CdxBackfillJob.normalizeMime("asdf") === "asdf")
+ assert(CdxBackfillJob.normalizeMime("application/pdf") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("application/pdf+journal") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("Application/PDF") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("application/p") === "application/p")
+ assert(CdxBackfillJob.normalizeMime("application/xml+stuff") === "text/xml")
+ assert(CdxBackfillJob.normalizeMime("application/x-pdf") === "application/pdf")
+ assert(CdxBackfillJob.normalizeMime("application/x-html") === "application/x-html")
+ }
+
+ it should "filter CDX lines" in {
+ assert(true === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ // redirect
+ assert(false === keepCdx(lineToCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 301 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")))
+ // not PDF
+ assert(false === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf text/plain 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ // invalid base32 SHA1
+ assert(false === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FE010101010101010101VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ assert(false === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL33FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ // dashed field
+ assert(false === keepCdx(lineToCdxLine(
+ """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 - application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""")))
+ }
+
+ it should "know what CDX lines are" in {
+ assert(true === isCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ assert(false === isCdxLine(""))
+ assert(false === isCdxLine(
+ " edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ assert(false === isCdxLine(
+ "#edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ // missing two fields
+ assert(false === isCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+ // extra field
+ assert(false === isCdxLine(
+ "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz -"))
+ }
+
+ it should "execute lineToRow" in {
+ // this particular test copied from python test_backfill_hbase_from_cdx.py
+ val row = cdxLineToRow(lineToCdxLine(
+ "eu,eui,cadmus)/bitstream/handle/1814/36635/rscas_2015_03.pdf;jsessionid=761393014319a39f40d32ae3eb3a853f?sequence=1 20170705062202 http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1 application/PDF 200 MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J - - 854156 328850624 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"))
+
+ assert(row._1 == "sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J")
+ JSON.parseFull(row._2) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("u") == "http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1")
+ assert(obj("f") == "CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ assert(obj("c") == 854156)
+ assert(obj("o") == 328850624)
+ assert(obj("d") == "2017-07-05T06:22:02Z")
+ }
+ case other => assert(false)
+ }
+ JSON.parseFull(row._3) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("surt") == "eu,eui,cadmus)/bitstream/handle/1814/36635/rscas_2015_03.pdf;jsessionid=761393014319a39f40d32ae3eb3a853f?sequence=1")
+ assert(obj("dt") == "20170705062202")
+ assert(obj("url") == "http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1")
+ assert(obj("c_size") == 854156)
+ assert(obj("offset") == 328850624)
+ assert(obj("warc") == "CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ }
+ case other => assert(false)
+ }
+ assert(row._4 == "application/pdf")
+ }
+
+}
+
+@RunWith(classOf[JUnitRunner])
+class CdxBackfillJobTest extends FunSpec with TupleConversions {
+
+ val (testTable, testHost, testCdxFile) = ("test-table", "dummy-host:2181", "test_file.cdx")
+
+ val log = LoggerFactory.getLogger(this.getClass.getName)
+
+ val dummySizeBytes = Bytes.toBytes(100)
+
+ val sampleData = List(
+ List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), dummySizeBytes),
+ List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), dummySizeBytes),
+ List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), dummySizeBytes),
+ List(Bytes.toBytes("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT"), dummySizeBytes)
+ )
+ val sampleCdxLines = List(
+ // clean line
+ "0" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
+ // has existing SHA1
+ "1" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
+ // non-200 HTTP status code
+ "2" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 301 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
+ // not CDX (prefixed with hash)
+ "3" -> """#edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz""",
+ // not PDF
+ "4" -> """edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/film 200 AAAAAEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"""
+ )
+
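+ // Scalding JobTest harness: feeds the sample HBase rows and CDX lines through the
+ // job and expects only the single clean, not-already-present CDX line to reach the HBase sink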
+ JobTest("sandcrawler.CdxBackfillJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("cdx-input-path", testCdxFile)
+ .arg("debug", "true")
+ .source[Tuple](CdxBackfillJob.getHBaseSource(testTable, testHost),
+ sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+ .source(TextLine(testCdxFile), sampleCdxLines)
+ .sink[(ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable)](CdxBackfillJob.getHBaseSink(testTable, testHost)) {
+ outputBuffer =>
+
+ val buf0 = outputBuffer(0)
+ val row0 = List(buf0._1, buf0._2, buf0._3, buf0._4).map(b => Bytes.toString(b.copyBytes()))
+
+ it("should return a 1-element list (after join).") {
+ assert(outputBuffer.size === 1)
+ }
+
+ it("should insert the valid, new CDX line") {
+ assert(row0(0) == "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G")
+ JSON.parseFull(row0(1)) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("u") == "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf")
+ assert(obj("f") == "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ assert(obj("c") == 210251)
+ assert(obj("o") == 931661233)
+ assert(obj("d") == "2017-08-28T23:31:54Z")
+ }
+ case other => assert(false)
+ }
+ JSON.parseFull(row0(2)) match {
+ case Some(obj: Map[String, Any]) => {
+ assert(obj("surt") == "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf")
+ assert(obj("dt") == "20170828233154")
+ assert(obj("url") == "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf")
+ assert(obj("c_size") == 210251)
+ assert(obj("offset") == 931661233)
+ assert(obj("warc") == "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz")
+ }
+ case other => assert(false)
+ }
+ assert(row0(3) == "application/pdf")
+ }
+ }
+ .run
+ .finish
+}
diff --git a/sql/Makefile b/sql/Makefile
new file mode 100644
index 0000000..860addb
--- /dev/null
+++ b/sql/Makefile
@@ -0,0 +1,35 @@
+
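+# Helper targets for periodic sandcrawler SQL database snapshots (pg_dump) and
+# uploads to archive.org (via the `ia` CLI)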
+SHELL=/bin/bash -euo pipefail
+TODAY ?= $(shell date --iso --utc)
+DATADIR ?= /srv/sandcrawler/tasks/$(TODAY)
+DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S)
+DATABASE_URL ?= sandcrawler
+
+.PHONY: help
+help: ## Print info about all commands
+ @echo "Commands:"
+ @echo
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: create_datadir
+create_datadir:
+ mkdir -p $(DATADIR)/
+ sudo chmod a+rw $(DATADIR)/
+
+$(DATADIR)/.DB_DUMP:
+ sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip
+ mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump
+ touch $@
+
+.PHONY: database-snapshot
+database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot
+ @echo
+
+$(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP
+ ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))"
+ ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump
+ touch $@
+
+.PHONY: upload-database-snapshot
+upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org
+ @echo
diff --git a/sql/README.md b/sql/README.md
index b171614..e488006 100644
--- a/sql/README.md
+++ b/sql/README.md
@@ -5,6 +5,21 @@ No primary storage of anything in this table. Everything should be rapidly
re-creatable from dumps, kafka topics (compressed), CDX, petabox metadata, etc.
This is a secondary view on all of that.
+## Create Database and User
+
+Create a postgres superuser role with your username, like:
+
+ sudo su postgres
+ createuser -s bnewbold
+
+Create the database using the `diesel` tool (see the fatcat rust docs for install notes):
+
+ # DANGER: will delete/recreate entire database
+ diesel database reset
+
+In the future it would probably be better to create a real role/password and
+supply these via the `DATABASE_URL` environment variable.
+
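+For example, a role and password would be supplied like this (hypothetical
+credentials, following the format in `example.env`):
+
+ export DATABASE_URL="postgres://sandcrawler:CHANGEME@localhost/sandcrawler"
+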
## Schema
schema/database name is 'sandcrawler'
@@ -124,3 +139,30 @@ Questions we might want to answer
http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES
http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278
+
+## Full SQL Database Dumps
+
+Run a dump in compressed, postgres custom format, excluding the `crossref` table (which is large and redundant):
+
+ export DATESLUG="`date +%Y-%m-%d.%H%M%S`"
+ time sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > sandcrawler_full_dbdump_${DATESLUG}.pgdump
+
+As of 2021-12-03, this process runs for about 6 hours and the compressed
+snapshot is 102 GB (compared with 940 GB of database disk consumption,
+including crossref).
+
+Then, upload to petabox as a backup:
+
+ ia upload sandcrawler_full_dbdump_YYYY-MM-DD -m mediatype:data -m collection:webgroup-internal-backups -m title:"Sandcrawler SQL Dump (YYYY-MM-DD)" sandcrawler_full_dbdump_${DATESLUG}.pgdump
+
+
+## SQL Database Restore
+
+To restore a dump (which will delete local database content, if any):
+
+ sudo su postgres
+ createuser --no-login web_anon
+ createuser -s sandcrawler
+ time pg_restore --jobs=4 --verbose --clean --if-exists --create --exit-on-error -d postgres sandcrawler_full_dbdump_2021-04-08.003952.pgdump
+
+Took about 2.5 hours.
diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md
index f1a5f86..4a56065 100644
--- a/sql/backfill/backfill.md
+++ b/sql/backfill/backfill.md
@@ -76,6 +76,19 @@ In psql:
COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
# => COPY 24727350
+On 2021-11-26:
+
+ zcat file_export.json.gz \
+ | pv -l \
+ | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0], (.urls|length >= 1), .content_scope] | @tsv' \
+ | sort -S 8G \
+ | uniq -w 40 \
+ | pigz \
+ > fatcat_file.2021-11-26.tsv.gz
+
+ # note: the .tsv.gz above presumably gets decompressed to /srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv before this COPY
+ COPY fatcat_file FROM '/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # COPY 112086814
+
## `file_meta`
zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv
diff --git a/sql/backfill/backfill_cdx.py b/sql/backfill/backfill_cdx.py
index 1c452ca..f929502 100755
--- a/sql/backfill/backfill_cdx.py
+++ b/sql/backfill/backfill_cdx.py
@@ -109,6 +109,7 @@ def stdin_to_pg():
info = parse_cdx_line(l)
if not info:
continue
+ # XXX: filter to, e.g., PDF or octet-stream mimetypes (derp)
batch.append(info)
counts['total'] += 1
if len(batch) >= 1000:
diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql
new file mode 100644
index 0000000..a7d6c2b
--- /dev/null
+++ b/sql/dump_file_meta.sql
@@ -0,0 +1,12 @@
+
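+-- Dump the full file_meta table as (sha1hex, JSON) rows to a task file.
+-- Presumably run like: psql sandcrawler < dump_file_meta.sql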
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT sha1hex, row_to_json(file_meta)
+ FROM file_meta
+ ORDER BY sha1hex ASC
+)
+TO '/srv/sandcrawler/tasks/file_meta_dump.tsv'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_regrobid_pdf.sql b/sql/dump_regrobid_pdf.sql
new file mode 100644
index 0000000..b846834
--- /dev/null
+++ b/sql/dump_regrobid_pdf.sql
@@ -0,0 +1,15 @@
+
+-- Run like:
+-- psql sandcrawler < dump_regrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf.2019-11-12.json
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT cdx.sha1hex, row_to_json(cdx) FROM cdx
+ WHERE cdx.mimetype = 'application/pdf'
+ AND EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL)
+)
+TO STDOUT
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql
new file mode 100644
index 0000000..e7c48f3
--- /dev/null
+++ b/sql/dump_regrobid_pdf_petabox.sql
@@ -0,0 +1,15 @@
+
+-- Run like:
+-- psql sandcrawler < dump_regrobid_pdf_petabox.sql
+-- cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox
+ WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL)
+)
+TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_bulk.sql b/sql/dump_reingest_bulk.sql
new file mode 100644
index 0000000..698db7a
--- /dev/null
+++ b/sql/dump_reingest_bulk.sql
@@ -0,0 +1,31 @@
+
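+-- Dump recent failed fatcat-changelog/fatcat-ingest requests (spn2/cdx/petabox
+-- errors, 1-181 days old) as JSON rows for bulk re-ingest.
+-- Presumably run like: psql sandcrawler < dump_reingest_bulk.sql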
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html')
+ AND ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '24 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '181 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ OR ingest_file_result.status like 'cdx-error'
+ OR ingest_file_result.status like 'petabox-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_old.sql b/sql/dump_reingest_old.sql
new file mode 100644
index 0000000..7473420
--- /dev/null
+++ b/sql/dump_reingest_old.sql
@@ -0,0 +1,36 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '6 day'::INTERVAL
+ -- AND ingest_request.created > NOW() - '181 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container'
+ OR ingest_request.ingest_request_source = 'unpaywall'
+ OR ingest_request.ingest_request_source = 'arxiv'
+ OR ingest_request.ingest_request_source = 'pmc'
+ OR ingest_request.ingest_request_source = 'doaj'
+ OR ingest_request.ingest_request_source = 'dblp')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ -- OR ingest_file_result.status like 'no-capture'
+ -- OR ingest_file_result.status like 'cdx-error'
+ -- OR ingest_file_result.status like 'petabox-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql
new file mode 100644
index 0000000..dbeb199
--- /dev/null
+++ b/sql/dump_reingest_quarterly.sql
@@ -0,0 +1,47 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
+ AND ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '8 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '91 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container'
+ OR ingest_request.ingest_request_source = 'unpaywall'
+ OR ingest_request.ingest_request_source = 'arxiv'
+ OR ingest_request.ingest_request_source = 'pmc'
+ OR ingest_request.ingest_request_source = 'doaj'
+ OR ingest_request.ingest_request_source = 'dblp')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_quarterly_current.rows.json';
+
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+
+ROLLBACK;
diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql
new file mode 100644
index 0000000..a83125c
--- /dev/null
+++ b/sql/dump_reingest_spn.sql
@@ -0,0 +1,36 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
+ AND ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '6 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '180 day'::INTERVAL
+ AND ingest_request.ingest_request_source = 'savepapernow-web'
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ -- OR ingest_file_result.status = 'cdx-error'
+ -- OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ -- OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_terminalstatus.sql b/sql/dump_reingest_terminalstatus.sql
new file mode 100644
index 0000000..b72a096
--- /dev/null
+++ b/sql/dump_reingest_terminalstatus.sql
@@ -0,0 +1,34 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '72 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '10 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ AND ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ OR ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 404
+ )
+ AND (
+ ingest_request.base_url LIKE 'https://doi.org/10.3390/%'
+ OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%'
+ OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%'
+ )
+) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json';
+
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+
+ROLLBACK;
diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql
new file mode 100644
index 0000000..a019938
--- /dev/null
+++ b/sql/dump_reingest_weekly.sql
@@ -0,0 +1,42 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
+ AND ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '8 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '8 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ -- OR ingest_file_result.status = 'cdx-error'
+ -- OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ -- OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json';
+
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+
+ROLLBACK;
diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql
new file mode 100644
index 0000000..a7fb920
--- /dev/null
+++ b/sql/dump_unextracted_pdf.sql
@@ -0,0 +1,22 @@
+
+-- Run like:
+-- psql sandcrawler < dump_unextracted_pdf.sql
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE cdx.sha1hex IS NOT NULL
+ --AND fatcat_file.sha1hex IS NOT NULL
+ AND ingest_file_result.terminal_sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+)
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql
new file mode 100644
index 0000000..bb9f162
--- /dev/null
+++ b/sql/dump_unextracted_pdf_petabox.sql
@@ -0,0 +1,18 @@
+
+-- Run like:
+-- psql sandcrawler < dump_unextracted_pdf_petabox.sql
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox)
+ FROM grobid
+ LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex
+ LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
+ WHERE petabox.sha1hex IS NOT NULL
+ AND pdf_meta.sha1hex IS NULL
+)
+TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql
new file mode 100644
index 0000000..81caf18
--- /dev/null
+++ b/sql/dump_ungrobid_pdf.sql
@@ -0,0 +1,18 @@
+
+-- Run like:
+-- psql sandcrawler < dump_ungrobid_pdf.sql
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM cdx
+ WHERE cdx.mimetype = 'application/pdf'
+ AND NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL)
+ -- uncomment/comment this to control whether only fatcat files are included
+ --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex)
+)
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql
new file mode 100644
index 0000000..b7a1db2
--- /dev/null
+++ b/sql/dump_ungrobid_pdf_petabox.sql
@@ -0,0 +1,17 @@
+
+-- Run like:
+-- psql sandcrawler < dump_ungrobid_pdf_petabox.sql
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox)
+ FROM petabox
+ WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL)
+ -- uncomment/comment this to control whether only fatcat files are included
+ AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex)
+)
+TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json'
+WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql
new file mode 100644
index 0000000..333ff7b
--- /dev/null
+++ b/sql/dump_unmatched_glutton_pdf.sql
@@ -0,0 +1,19 @@
+
+-- Run like:
+-- psql sandcrawler < THING.sql > THING.2019-09-23.json
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(grobid)
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL
+ LIMIT 1000
+)
+TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json';
+--TO STDOUT
+--WITH NULL '';
+
+ROLLBACK;
diff --git a/sql/example.env b/sql/example.env
new file mode 100644
index 0000000..3a13689
--- /dev/null
+++ b/sql/example.env
@@ -0,0 +1 @@
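+# Example connection string for local development; e.g. copy to a `.env` file
+# so tools like `diesel` pick up DATABASE_URL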
+DATABASE_URL="postgres://fatcat:tactaf@localhost/sandcrawler"
diff --git a/sql/ingest_again.md b/sql/ingest_again.md
new file mode 100644
index 0000000..b749557
--- /dev/null
+++ b/sql/ingest_again.md
@@ -0,0 +1,158 @@
+
+## re-ingest some broken
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'spn2-%'
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
+ ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'cdx-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'cdx-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+ AND ingest_request.ingest_request_source != 'fatcat-ingest')
+ ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'wayback-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'gateway-timeout'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'petabox-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json';
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
+
+Push to kafka (shuffled):
+
+ cat reingest_spn2-error_current.json reingest_cdx-error_current.json reingest_wayback-error_current.json reingest_petabox-error_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+ cat reingest_gateway-timeout.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 0
+
+ cat reingest_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Push to kafka (not shuffled):
+
+ cat reingest_spn2-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_cdx-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_cdx-error_bulk_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat reingest_wayback-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_gateway-timeout.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_petabox-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## just recent fatcat-ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ -- AND ingest_file_result.updated > NOW() - '24 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '7 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND (ingest_file_result.status like 'spn2-%'
+ OR ingest_file_result.status like 'cdx-error'
+ OR ingest_file_result.status like 'gateway-timeout'
+ OR ingest_file_result.status like 'wayback-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json';
+
+ # note: shuf
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
+
+ cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## specific domains
+
+protocols.io:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url LIKE '%10.17504/protocols.io%'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+biorxiv/medrxiv:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url LIKE '%10.1101/20%'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt
new file mode 100644
index 0000000..b684400
--- /dev/null
+++ b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt
@@ -0,0 +1,326 @@
+ doi_prefix | status | count
+------------+-------------------------------+--------
+ 10.1001 | | 230
+ 10.1002 | | 3914
+ 10.1002 | terminal-bad-status | 1540
+ 10.1002 | forbidden | 1072
+ 10.1002 | redirect-loop | 995
+ 10.1002 | no-pdf-link | 210
+ 10.1016 | | 7976
+ 10.1016 | no-pdf-link | 4648
+ 10.1016 | terminal-bad-status | 1778
+ 10.1016 | forbidden | 622
+ 10.1016 | spn2-error:too-many-redirects | 344
+ 10.1016 | redirect-loop | 225
+ 10.1017 | | 2040
+ 10.1017 | no-pdf-link | 720
+ 10.1017 | success | 441
+ 10.1017 | link-loop | 371
+ 10.1017 | bad-redirect | 227
+ 10.1021 | | 1722
+ 10.1021 | blocked-cookie | 1552
+ 10.1029 | | 248
+ 10.1039 | | 1160
+ 10.1039 | redirect-loop | 486
+ 10.1039 | spn2-error:too-many-redirects | 395
+ 10.1039 | spn2-wayback-error | 213
+ 10.1051 | | 695
+ 10.1051 | success | 557
+ 10.1055 | | 541
+ 10.1055 | not-found | 295
+ 10.1055 | redirect-loop | 213
+ 10.1057 | | 2835
+ 10.1057 | redirect-loop | 2617
+ 10.1061 | | 550
+ 10.1061 | spn2-error:too-many-redirects | 425
+ 10.1063 | | 600
+ 10.1063 | spn2-error:too-many-redirects | 328
+ 10.1080 | | 3801
+ 10.1080 | blocked-cookie | 2431
+ 10.1080 | terminal-bad-status | 711
+ 10.1080 | forbidden | 341
+ 10.1081 | | 299
+ 10.1081 | link-loop | 222
+ 10.1089 | | 236
+ 10.1089 | blocked-cookie | 228
+ 10.1093 | | 12805
+ 10.1093 | link-loop | 8627
+ 10.1093 | redirect-loop | 1659
+ 10.1093 | no-pdf-link | 1475
+ 10.1093 | bad-redirect | 428
+ 10.1093 | success | 391
+ 10.1097 | | 1497
+ 10.1097 | no-pdf-link | 503
+ 10.1097 | link-loop | 346
+ 10.1097 | spn2-error:too-many-redirects | 259
+ 10.1097 | terminal-bad-status | 202
+ 10.1101 | | 1859
+ 10.1101 | redirect-loop | 993
+ 10.1101 | forbidden | 703
+ 10.1103 | | 597
+ 10.1103 | not-found | 534
+ 10.1108 | | 1055
+ 10.1108 | no-pdf-link | 945
+ 10.1109 | | 7067
+ 10.1109 | spn2-error:too-many-redirects | 6299
+ 10.1109 | success | 667
+ 10.1111 | | 2099
+ 10.1111 | redirect-loop | 1331
+ 10.1111 | terminal-bad-status | 313
+ 10.1111 | forbidden | 226
+ 10.1115 | | 1278
+ 10.1115 | bad-redirect | 707
+ 10.1117 | | 561
+ 10.1117 | spn2-error:too-many-redirects | 501
+ 10.1126 | | 214
+ 10.1136 | | 1989
+ 10.1136 | success | 1463
+ 10.1136 | link-loop | 294
+ 10.1142 | | 300
+ 10.1142 | blocked-cookie | 237
+ 10.1145 | | 440
+ 10.1145 | blocked-cookie | 354
+ 10.1155 | | 480
+ 10.1155 | success | 474
+ 10.11588 | | 506
+ 10.11588 | no-pdf-link | 264
+ 10.11588 | success | 236
+ 10.1159 | | 226
+ 10.11606 | | 304
+ 10.1161 | | 1142
+ 10.1161 | blocked-cookie | 1011
+ 10.1163 | | 2261
+ 10.1163 | link-loop | 1767
+ 10.1163 | success | 348
+ 10.11648 | | 405
+ 10.11648 | success | 404
+ 10.1182 | | 2125
+ 10.1182 | no-pdf-link | 2024
+ 10.1183 | | 987
+ 10.1183 | redirect-loop | 838
+ 10.1186 | | 1481
+ 10.1186 | success | 1412
+ 10.1201 | | 7649
+ 10.1201 | link-loop | 5383
+ 10.1201 | forbidden | 1504
+ 10.1201 | no-pdf-link | 312
+ 10.1299 | | 264
+ 10.1299 | no-pdf-link | 209
+ 10.13134 | | 201
+ 10.1353 | | 549
+ 10.1353 | terminal-bad-status | 443
+ 10.1371 | | 552
+ 10.1371 | success | 542
+ 10.14201 | | 656
+ 10.14201 | success | 366
+ 10.14361 | | 647
+ 10.14361 | link-loop | 585
+ 10.14746 | | 260
+ 10.14746 | success | 232
+ 10.1504 | | 527
+ 10.1504 | no-pdf-link | 501
+ 10.15122 | | 246
+ 10.15122 | success | 243
+ 10.1515 | | 16240
+ 10.1515 | link-loop | 12589
+ 10.1515 | success | 1941
+ 10.1515 | no-pdf-link | 1008
+ 10.1515 | not-found | 283
+ 10.15405 | | 229
+ 10.15405 | success | 218
+ 10.1553 | | 418
+ 10.1553 | no-pdf-link | 396
+ 10.1590 | | 655
+ 10.1590 | success | 623
+ 10.17104 | | 1202
+ 10.17104 | no-pdf-link | 953
+ 10.17104 | bad-redirect | 249
+ 10.17605 | | 368
+ 10.17605 | not-found | 337
+ 10.17615 | | 9401
+ 10.17615 | redirect-loop | 5720
+ 10.17615 | spn2-wayback-error | 3099
+ 10.17615 | spn2-cdx-lookup-failure | 201
+ 10.17863 | | 438
+ 10.18148 | | 465
+ 10.18148 | success | 462
+ 10.18720 | | 210
+ 10.18821 | | 476
+ 10.18821 | redirect-loop | 366
+ 10.20345 | | 222
+ 10.20345 | terminal-bad-status | 215
+ 10.20546 | | 244
+ 10.20546 | no-pdf-link | 241
+ 10.21037 | | 232
+ 10.2118 | | 903
+ 10.2118 | redirect-loop | 853
+ 10.21203 | | 1824
+ 10.21203 | success | 1545
+ 10.2139 | | 1493
+ 10.2139 | link-loop | 1145
+ 10.2147 | | 318
+ 10.2147 | success | 267
+ 10.2172 | | 282
+ 10.2174 | | 363
+ 10.2174 | no-pdf-link | 320
+ 10.2196 | | 265
+ 10.2208 | | 299
+ 10.22215 | | 218
+ 10.22215 | success | 217
+ 10.22323 | | 289
+ 10.22323 | success | 262
+ 10.22533 | | 395
+ 10.22533 | success | 393
+ 10.22541 | | 291
+ 10.22541 | success | 275
+ 10.23919 | | 426
+ 10.23919 | spn2-error:too-many-redirects | 403
+ 10.24034 | | 319
+ 10.24034 | spn2-error | 203
+ 10.24355 | | 15360
+ 10.24355 | no-pdf-link | 15228
+ 10.24411 | | 1506
+ 10.24411 | forbidden | 823
+ 10.24411 | redirect-loop | 647
+ 10.25335 | | 550
+ 10.25335 | no-pdf-link | 550
+ 10.25365 | | 429
+ 10.25365 | success | 424
+ 10.25384 | | 338
+ 10.25384 | success | 249
+ 10.25646 | | 239
+ 10.26197 | no-pdf-link | 303
+ 10.26197 | | 303
+ 10.26226 | | 272
+ 10.26278 | | 1291
+ 10.26278 | redirect-loop | 756
+ 10.26278 | spn2-error:too-many-redirects | 509
+ 10.29327 | | 232
+ 10.2991 | | 307
+ 10.2991 | spn2-wayback-error | 227
+ 10.30965 | | 722
+ 10.30965 | link-loop | 709
+ 10.3109 | | 801
+ 10.3109 | link-loop | 572
+ 10.3109 | forbidden | 228
+ 10.31219 | | 951
+ 10.31219 | redirect-loop | 518
+ 10.31219 | spn2-wayback-error | 356
+ 10.31274 | | 296
+ 10.31743 | | 403
+ 10.31743 | success | 294
+ 10.31857 | | 209
+ 10.3233 | | 471
+ 10.33448 | | 213
+ 10.33448 | success | 212
+ 10.3389 | | 1459
+ 10.3389 | success | 1417
+ 10.3390 | | 4511
+ 10.3390 | success | 3577
+ 10.3390 | terminal-bad-status | 485
+ 10.3390 | forbidden | 379
+ 10.3406 | | 243
+ 10.3406 | terminal-bad-status | 213
+ 10.34944 | | 527
+ 10.34944 | success | 459
+ 10.35016 | | 688
+ 10.35016 | no-pdf-link | 687
+ 10.36347 | success | 213
+ 10.36347 | | 213
+ 10.37747 | | 213
+ 10.37747 | no-pdf-link | 213
+ 10.37904 | | 227
+ 10.37904 | no-pdf-link | 226
+ 10.3917 | | 347
+ 10.3917 | redirect-loop | 208
+ 10.3923 | | 356
+ 10.3923 | redirect-loop | 254
+ 10.3929 | | 317
+ 10.3929 | terminal-bad-status | 310
+ 10.3931 | | 279
+ 10.3931 | no-pdf-link | 279
+ 10.4000 | | 7828
+ 10.4000 | success | 3485
+ 10.4000 | spn2-wayback-error | 2142
+ 10.4000 | redirect-loop | 2106
+ 10.4018 | | 249
+ 10.4018 | not-found | 240
+ 10.4103 | | 726
+ 10.4103 | remote-server-error | 343
+ 10.4103 | redirect-loop | 324
+ 10.4159 | | 286
+ 10.4159 | link-loop | 238
+ 10.4324 | | 19398
+ 10.4324 | link-loop | 12471
+ 10.4324 | forbidden | 3632
+ 10.4324 | not-found | 2283
+ 10.4324 | terminal-bad-status | 645
+ 10.4324 | success | 208
+ 10.47295 | | 456
+ 10.47295 | success | 449
+ 10.47513 | | 218
+ 10.47513 | no-pdf-link | 203
+ 10.48084 | success | 538
+ 10.48084 | | 538
+ 10.5040 | | 375
+ 10.5040 | no-pdf-link | 365
+ 10.5167 | | 290
+ 10.5167 | redirect-loop | 278
+ 10.5169 | | 360
+ 10.5169 | no-pdf-link | 355
+ 10.5194 | | 917
+ 10.5194 | success | 887
+ 10.5216 | | 213
+ 10.5220 | no-pdf-link | 397
+ 10.5220 | | 397
+ 10.5281 | | 22551
+ 10.5281 | terminal-bad-status | 12158
+ 10.5281 | success | 4901
+ 10.5281 | no-pdf-link | 4754
+ 10.5281 | spn2-error:unknown | 360
+ 10.5282 | | 228
+ 10.5451 | | 2068
+ 10.5451 | success | 1071
+ 10.5451 | terminal-bad-status | 817
+ 10.5753 | | 268
+ 10.5753 | success | 264
+ 10.5771 | | 941
+ 10.5771 | no-pdf-link | 397
+ 10.5771 | bad-redirect | 269
+ 10.5771 | link-loop | 238
+ 10.6068 | | 441
+ 10.6068 | no-pdf-link | 384
+ 10.6084 | | 917
+ 10.6084 | no-pdf-link | 520
+ 10.6084 | success | 368
+ 10.7287 | | 234
+ 10.7287 | no-pdf-link | 212
+ 10.7312 | | 382
+ 10.7312 | link-loop | 291
+ 10.7554 | | 205
+ 10.7891 | | 380
+ 10.7891 | no-pdf-link | 376
+ 10.7916 | | 331
+ 10.7916 | no-pdf-link | 201
+ 10.7939 | | 535
+ 10.7939 | no-pdf-link | 527
+ | | 272831
+ | success | 62298
+ | no-pdf-link | 60737
+ | link-loop | 48558
+ | redirect-loop | 26842
+ | terminal-bad-status | 22685
+ | spn2-error:too-many-redirects | 11174
+ | forbidden | 10900
+ | spn2-wayback-error | 7796
+ | blocked-cookie | 6961
+ | not-found | 5468
+ | bad-redirect | 2666
+ | spn2-error | 2398
+ | spn2-cdx-lookup-failure | 1374
+ | petabox-error | 678
+ | remote-server-error | 461
+ | wrong-mimetype | 443
+ | spn2-error:proxy-error | 420
+ | spn2-error:unknown | 360
+(323 rows)
diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt
new file mode 100644
index 0000000..28dd0d0
--- /dev/null
+++ b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt
@@ -0,0 +1,307 @@
+ domain | status | count
+-------------------------------------------------------------------+-------------------------------+--------
+ 202.148.31.178 | | 298
+ academic.oup.com | | 1624
+ academic.oup.com | no-pdf-link | 673
+ academic.oup.com | bad-redirect | 444
+ academic.oup.com | link-loop | 358
+ aip.scitation.org | | 257
+ apps.crossref.org | | 1414
+ apps.crossref.org | no-pdf-link | 1410
+ article.sciencepublishinggroup.com | | 404
+ article.sciencepublishinggroup.com | success | 404
+ arxiv.org | | 24340
+ arxiv.org | success | 22381
+ arxiv.org | terminal-bad-status | 1260
+ arxiv.org | no-pdf-link | 412
+ arxiv.org | no-capture | 262
+ ashpublications.org | | 2049
+ ashpublications.org | no-pdf-link | 2024
+ asmedigitalcollection.asme.org | | 1245
+ asmedigitalcollection.asme.org | bad-redirect | 707
+ assets.researchsquare.com | | 1549
+ assets.researchsquare.com | success | 1546
+ bioone.org | | 201
+ biorxiv.org | redirect-loop | 702
+ biorxiv.org | | 702
+ blogs.ethz.ch | | 687
+ blogs.ethz.ch | no-pdf-link | 686
+ books.openedition.org | | 446
+ books.openedition.org | redirect-loop | 382
+ brill.com | | 2203
+ brill.com | link-loop | 1779
+ brill.com | success | 359
+ catalog.paradisec.org.au | | 770
+ catalog.paradisec.org.au | redirect-loop | 756
+ cdr.lib.unc.edu | | 9432
+ cdr.lib.unc.edu | redirect-loop | 5720
+ cdr.lib.unc.edu | spn2-wayback-error | 3187
+ cdr.lib.unc.edu | spn2-cdx-lookup-failure | 201
+ classiques-garnier.com | | 246
+ classiques-garnier.com | success | 243
+ content.iospress.com | | 242
+ content.taylorfrancis.com | | 309
+ content.taylorfrancis.com | terminal-bad-status | 309
+ curve.carleton.ca | success | 201
+ curve.carleton.ca | | 201
+ cyberdoi.ru | redirect-loop | 647
+ cyberdoi.ru | | 647
+ czasopisma.kul.pl | | 402
+ czasopisma.kul.pl | success | 294
+ d.lib.msu.edu | | 550
+ d.lib.msu.edu | no-pdf-link | 550
+ d197for5662m48.cloudfront.net | success | 276
+ d197for5662m48.cloudfront.net | | 276
+ dergipark.org.tr | | 674
+ dergipark.org.tr | no-pdf-link | 255
+ dergipark.org.tr | success | 248
+ digi.ub.uni-heidelberg.de | no-pdf-link | 261
+ digi.ub.uni-heidelberg.de | | 261
+ dl.acm.org | | 441
+ dl.acm.org | blocked-cookie | 361
+ dlc.library.columbia.edu | | 201
+ dlc.library.columbia.edu | no-pdf-link | 201
+ doi.ala.org.au | | 308
+ doi.ala.org.au | no-pdf-link | 308
+ doi.org | | 474
+ doi.org | terminal-bad-status | 344
+ downloads.hindawi.com | | 479
+ downloads.hindawi.com | success | 478
+ edoc.rki.de | | 238
+ edoc.unibas.ch | | 2018
+ edoc.unibas.ch | success | 1067
+ edoc.unibas.ch | terminal-bad-status | 817
+ elib.spbstu.ru | | 205
+ elifesciences.org | | 204
+ era.library.ualberta.ca | | 531
+ era.library.ualberta.ca | no-pdf-link | 527
+ erj.ersjournals.com | | 951
+ erj.ersjournals.com | redirect-loop | 829
+ europepmc.org | | 289
+ europepmc.org | success | 283
+ figshare.com | | 233
+ figshare.com | no-pdf-link | 208
+ fjfsdata01prod.blob.core.windows.net | | 1430
+ fjfsdata01prod.blob.core.windows.net | success | 1418
+ hw.oeaw.ac.at | | 283
+ hw.oeaw.ac.at | no-pdf-link | 283
+ idb.ub.uni-tuebingen.de | | 216
+ idb.ub.uni-tuebingen.de | terminal-bad-status | 215
+ ieeexplore.ieee.org | | 7561
+ ieeexplore.ieee.org | spn2-error:too-many-redirects | 6732
+ ieeexplore.ieee.org | success | 683
+ ijgc.bmj.com | | 411
+ ijgc.bmj.com | success | 399
+ jamanetwork.com | | 229
+ jitc.bmj.com | | 849
+ jitc.bmj.com | success | 773
+ journals.aps.org | | 539
+ journals.aps.org | not-found | 534
+ journals.lww.com | | 1124
+ journals.lww.com | no-pdf-link | 547
+ journals.lww.com | link-loop | 399
+ journals.openedition.org | | 7366
+ journals.openedition.org | success | 3484
+ journals.openedition.org | spn2-wayback-error | 2120
+ journals.openedition.org | redirect-loop | 1720
+ journals.plos.org | | 552
+ journals.plos.org | success | 542
+ kiss.kstudy.com | | 306
+ kiss.kstudy.com | no-pdf-link | 292
+ lib.dr.iastate.edu | | 297
+ link.springer.com | | 2830
+ link.springer.com | redirect-loop | 2625
+ linkinghub.elsevier.com | | 970
+ linkinghub.elsevier.com | forbidden | 415
+ linkinghub.elsevier.com | spn2-error:too-many-redirects | 357
+ medrxiv.org | | 287
+ medrxiv.org | redirect-loop | 287
+ muse.jhu.edu | | 470
+ muse.jhu.edu | terminal-bad-status | 443
+ ojs.ub.uni-konstanz.de | | 463
+ ojs.ub.uni-konstanz.de | success | 462
+ onlinelibrary.wiley.com | | 2064
+ onlinelibrary.wiley.com | terminal-bad-status | 1973
+ osf.io | | 1394
+ osf.io | redirect-loop | 589
+ osf.io | spn2-wayback-error | 425
+ osf.io | not-found | 342
+ othes.univie.ac.at | | 424
+ othes.univie.ac.at | success | 424
+ oxford.universitypressscholarship.com | | 8999
+ oxford.universitypressscholarship.com | link-loop | 8282
+ oxford.universitypressscholarship.com | no-pdf-link | 695
+ oxfordhandbooks.com | redirect-loop | 460
+ oxfordhandbooks.com | | 460
+ papers.ssrn.com | | 1313
+ papers.ssrn.com | link-loop | 1145
+ peerj.com | | 313
+ peerj.com | no-pdf-link | 212
+ periodicos.urca.br | | 446
+ periodicos.urca.br | success | 439
+ pos.sissa.it | | 277
+ pos.sissa.it | success | 262
+ preprints.jmir.org | | 242
+ pressto.amu.edu.pl | | 260
+ pressto.amu.edu.pl | success | 232
+ publikationsserver.tu-braunschweig.de | | 15358
+ publikationsserver.tu-braunschweig.de | no-pdf-link | 15228
+ publons.com | | 2810
+ publons.com | redirect-loop | 2359
+ publons.com | no-pdf-link | 444
+ pubs.acs.org | | 1647
+ pubs.acs.org | blocked-cookie | 1553
+ pubs.rsc.org | | 765
+ pubs.rsc.org | redirect-loop | 486
+ pubs.rsc.org | spn2-wayback-error | 214
+ res.mdpi.com | | 3620
+ res.mdpi.com | success | 3591
+ revistas.usal.es | | 580
+ revistas.usal.es | success | 298
+ revues.imist.ma | | 229
+ rsdjournal.org | | 213
+ rsdjournal.org | success | 212
+ s3-eu-west-1.amazonaws.com | | 764
+ s3-eu-west-1.amazonaws.com | success | 763
+ s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | | 324
+ s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | success | 324
+ saspublishers.com | | 213
+ saspublishers.com | success | 213
+ scholarshare.temple.edu | | 524
+ scholarshare.temple.edu | success | 464
+ sol.sbc.org.br | | 268
+ sol.sbc.org.br | success | 264
+ statisticaldatasets.data-planet.com | | 442
+ statisticaldatasets.data-planet.com | no-pdf-link | 390
+ watermark.silverchair.com | | 521
+ watermark.silverchair.com | success | 514
+ www.ahajournals.org | | 1061
+ www.ahajournals.org | blocked-cookie | 1011
+ www.atlantis-press.com | | 308
+ www.atlantis-press.com | spn2-wayback-error | 228
+ www.beck-elibrary.de | | 1202
+ www.beck-elibrary.de | no-pdf-link | 953
+ www.beck-elibrary.de | bad-redirect | 249
+ www.cairn.info | | 255
+ www.cairn.info | redirect-loop | 208
+ www.cambridge.org | | 2061
+ www.cambridge.org | no-pdf-link | 727
+ www.cambridge.org | success | 485
+ www.cambridge.org | link-loop | 388
+ www.cambridge.org | bad-redirect | 252
+ www.confer.cz | | 227
+ www.confer.cz | no-pdf-link | 226
+ www.dbpia.co.kr | | 773
+ www.dbpia.co.kr | no-pdf-link | 679
+ www.degruyter.com | | 17046
+ www.degruyter.com | link-loop | 14202
+ www.degruyter.com | success | 2201
+ www.degruyter.com | not-found | 235
+ www.dovepress.com | | 316
+ www.dovepress.com | success | 267
+ www.e-manuscripta.ch | | 384
+ www.e-manuscripta.ch | no-pdf-link | 383
+ www.e-periodica.ch | | 358
+ www.e-periodica.ch | no-pdf-link | 355
+ www.e-rara.ch | no-pdf-link | 279
+ www.e-rara.ch | | 279
+ www.e3s-conferences.org | | 426
+ www.e3s-conferences.org | success | 419
+ www.elibrary.ru | | 303
+ www.elibrary.ru | no-pdf-link | 301
+ www.emerald.com | | 943
+ www.emerald.com | no-pdf-link | 933
+ www.etasr.com | | 466
+ www.etasr.com | success | 466
+ www.eurekaselect.com | | 345
+ www.eurekaselect.com | no-pdf-link | 321
+ www.europeanproceedings.com | | 218
+ www.europeanproceedings.com | success | 218
+ www.finersistemas.com | success | 397
+ www.finersistemas.com | | 397
+ www.humankineticslibrary.com | no-pdf-link | 321
+ www.humankineticslibrary.com | | 321
+ www.ijcmas.com | | 251
+ www.ijcmas.com | no-pdf-link | 248
+ www.inderscience.com | | 524
+ www.inderscience.com | no-pdf-link | 501
+ www.ingentaconnect.com | | 366
+ www.ingentaconnect.com | no-pdf-link | 349
+ www.jstage.jst.go.jp | | 1591
+ www.jstage.jst.go.jp | success | 862
+ www.jstage.jst.go.jp | no-pdf-link | 567
+ www.jstor.org | | 351
+ www.karger.com | | 224
+ www.liebertpub.com | | 236
+ www.liebertpub.com | blocked-cookie | 228
+ www.mdpi.com | | 694
+ www.mdpi.com | terminal-bad-status | 480
+ www.medlit.ru | | 458
+ www.medlit.ru | redirect-loop | 366
+ www.morressier.com | | 285
+ www.morressier.com | no-pdf-link | 253
+ www.njca.info | | 223
+ www.njca.info | remote-server-error | 222
+ www.nomos-elibrary.de | | 913
+ www.nomos-elibrary.de | no-pdf-link | 379
+ www.nomos-elibrary.de | bad-redirect | 265
+ www.nomos-elibrary.de | link-loop | 236
+ www.onepetro.org | | 895
+ www.onepetro.org | redirect-loop | 853
+ www.osti.gov | | 212
+ www.persee.fr | | 232
+ www.persee.fr | terminal-bad-status | 213
+ www.repository.cam.ac.uk | | 439
+ www.research-collection.ethz.ch | | 312
+ www.research-collection.ethz.ch | terminal-bad-status | 310
+ www.revistas.ufg.br | | 212
+ www.schoeningh.de | | 371
+ www.schoeningh.de | link-loop | 366
+ www.scialert.net | | 276
+ www.scialert.net | redirect-loop | 254
+ www.scielo.br | | 644
+ www.scielo.br | success | 624
+ www.sciencedirect.com | | 6523
+ www.sciencedirect.com | no-pdf-link | 4668
+ www.sciencedirect.com | terminal-bad-status | 1737
+ www.scitepress.org | no-pdf-link | 397
+ www.scitepress.org | | 397
+ www.tandfonline.com | | 3448
+ www.tandfonline.com | blocked-cookie | 2446
+ www.tandfonline.com | terminal-bad-status | 714
+ www.taylorfrancis.com | | 21292
+ www.taylorfrancis.com | link-loop | 18648
+ www.taylorfrancis.com | forbidden | 2022
+ www.taylorfrancis.com | terminal-bad-status | 518
+ www.thieme-connect.de | | 513
+ www.thieme-connect.de | not-found | 292
+ www.thieme-connect.de | redirect-loop | 213
+ www.whateveryoneneedstoknow.com | | 1174
+ www.whateveryoneneedstoknow.com | redirect-loop | 1163
+ www.worldscientific.com | | 293
+ www.worldscientific.com | blocked-cookie | 240
+ www.zora.uzh.ch | | 290
+ www.zora.uzh.ch | redirect-loop | 278
+ zenodo.org | | 22202
+ zenodo.org | terminal-bad-status | 12158
+ zenodo.org | success | 4923
+ zenodo.org | no-pdf-link | 4788
+ | | 280719
+ | success | 85143
+ | no-pdf-link | 61335
+ | link-loop | 48566
+ | redirect-loop | 26845
+ | terminal-bad-status | 23955
+ | spn2-wayback-error | 7920
+ | spn2-error:too-many-redirects | 7175
+ | blocked-cookie | 6980
+ | forbidden | 2912
+ | bad-redirect | 2666
+ | spn2-error | 1943
+ | not-found | 1762
+ | spn2-cdx-lookup-failure | 1376
+ | wrong-mimetype | 467
+ | remote-server-error | 388
+ | spn2-error:proxy-error | 295
+ | no-capture | 262
+(304 rows)
diff --git a/sql/migrations/00000000000000_diesel_initial_setup/down.sql b/sql/migrations/00000000000000_diesel_initial_setup/down.sql
new file mode 100644
index 0000000..a9f5260
--- /dev/null
+++ b/sql/migrations/00000000000000_diesel_initial_setup/down.sql
@@ -0,0 +1,6 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
+DROP FUNCTION IF EXISTS diesel_set_updated_at();
diff --git a/sql/migrations/00000000000000_diesel_initial_setup/up.sql b/sql/migrations/00000000000000_diesel_initial_setup/up.sql
new file mode 100644
index 0000000..d68895b
--- /dev/null
+++ b/sql/migrations/00000000000000_diesel_initial_setup/up.sql
@@ -0,0 +1,36 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+
+
+
+-- Sets up a trigger for the given table to automatically set a column called
+-- `updated_at` whenever the row is modified (unless `updated_at` was included
+-- in the modified columns)
+--
+-- # Example
+--
+-- ```sql
+-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
+--
+-- SELECT diesel_manage_updated_at('users');
+-- ```
+CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
+BEGIN
+ EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
+ FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
+BEGIN
+ IF (
+ NEW IS DISTINCT FROM OLD AND
+ NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
+ ) THEN
+ NEW.updated_at := current_timestamp;
+ END IF;
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/sql/migrations/2019-12-19-060141_init/down.sql b/sql/migrations/2019-12-19-060141_init/down.sql
new file mode 100644
index 0000000..a085480
--- /dev/null
+++ b/sql/migrations/2019-12-19-060141_init/down.sql
@@ -0,0 +1,8 @@
+
+DROP TABLE IF EXISTS cdx;
+DROP TABLE IF EXISTS file_meta;
+DROP TABLE IF EXISTS fatcat_file;
+DROP TABLE IF EXISTS petabox;
+DROP TABLE IF EXISTS grobid;
+DROP TABLE IF EXISTS ingest_request;
+DROP TABLE IF EXISTS shadow;
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
new file mode 100644
index 0000000..33dba66
--- /dev/null
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -0,0 +1,245 @@
+
+-- rows *may* be revisit records; indicated by mimetype == "warc/revisit"
+-- records are implied to be 200 status (or 226 for ftp); either direct hits or
+-- revisits
+-- there is nothing to prevent duplicate hits. eg, same sha1, same url, many
+-- datetimes. import scripts should make an effort to reduce this sort of
+-- duplication though. one row per *domain*/sha1hex pair is a good guideline.
+-- all ingest result url/dt pairs should be included though.
+-- any mimetype is allowed, but the presumption is that the actual body is a full
+-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each is
+-- only part of a work). URLs that are parts of a fileset are allowed.
+CREATE TABLE IF NOT EXISTS cdx (
+ url TEXT NOT NULL CHECK (octet_length(url) >= 1),
+ datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
+    -- the sha1hex/cdx_sha1hex distinction is intended to capture the difference between
+    -- the CDX hash (which is of the transport-encoded body) and the actual body. Probably need
+    -- to include both for all records?
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1),
+ -- TODO: enforce that only paths with '/' (item+file) should be included?
+ warc_path TEXT CHECK (octet_length(warc_path) >= 1),
+ warc_csize BIGINT,
+ warc_offset BIGINT,
+ row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ PRIMARY KEY(url, datetime)
+);
+CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
+-- TODO: remove this index? not currently used
+CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
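+
+-- Example (hypothetical values, for illustration only) of a row for a direct
+-- PDF hit, following the guidelines above:
+--
+--   INSERT INTO cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
+--   VALUES ('https://example.org/paper.pdf', '20200101120000',
+--           'da39a3ee5e6b4b0d3255bfef95601890afd80709', 'application/pdf',
+--           'EXAMPLE-CRAWL-2020/EXAMPLE-20200101000000-00000.warc.gz', 123456, 789);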
+
+-- TODO: require all fields. if mimetype unknown, should be octet-stream
+CREATE TABLE IF NOT EXISTS file_meta (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ sha256hex TEXT CHECK (octet_length(sha256hex) = 64),
+ md5hex TEXT CHECK (octet_length(md5hex) = 32),
+ size_bytes BIGINT,
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1)
+);
+CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex);
+
+CREATE TABLE IF NOT EXISTS fatcat_file (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ file_ident TEXT CHECK (octet_length(file_ident) = 26),
+ first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26),
+ any_url BOOLEAN,
+ content_scope TEXT CHECK (octet_length(content_scope) >= 1)
+);
+
+CREATE TABLE IF NOT EXISTS petabox (
+ item TEXT NOT NULL CHECK (octet_length(item) >= 1),
+ path TEXT NOT NULL CHECK (octet_length(path) >= 1),
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ PRIMARY KEY(item, path)
+);
+CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);
+
+CREATE TABLE IF NOT EXISTS grobid (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ grobid_version TEXT CHECK (octet_length(grobid_version) >= 1),
+ status_code INT NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+ fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),
+ -- extracted basic biblio metadata:
+ -- title
+ -- authors[]
+ -- full/display
+ -- given_name
+ -- surname
+ -- affiliation
+ -- year
+ -- journal_issn
+ -- journal_name
+ -- refs_count
+ metadata JSONB
+);
+-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
+
+CREATE TABLE IF NOT EXISTS pdftrio (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status_code INT NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1),
+ models_date DATE,
+ ensemble_score REAL,
+ bert_score REAL,
+ linear_score REAL,
+ image_score REAL
+);
+
+CREATE TABLE IF NOT EXISTS pdf_meta (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ has_page0_thumbnail BOOLEAN NOT NULL,
+ page_count INT CHECK (page_count >= 0),
+ word_count INT CHECK (word_count >= 0),
+ page0_height REAL CHECK (page0_height >= 0),
+ page0_width REAL CHECK (page0_width >= 0),
+ permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),
+ pdf_created TIMESTAMP WITH TIME ZONE,
+ pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),
+ metadata JSONB
+ -- maybe some analysis of available fields?
+ -- metadata JSON fields:
+ -- title
+ -- subject
+ -- author
+ -- creator
+ -- producer
+ -- CrossMarkDomains
+ -- doi
+ -- form
+ -- encrypted
+);
+
+CREATE TABLE IF NOT EXISTS html_meta (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    scope TEXT CHECK (octet_length(scope) >= 1),
+ has_teixml BOOLEAN NOT NULL,
+ has_thumbnail BOOLEAN NOT NULL,
+ word_count INT CHECK (word_count >= 0),
+ biblio JSONB,
+ resources JSONB
+ -- biblio JSON fields are similar to fatcat release schema
+ -- resources JSON object is a list of objects with keys like webcapture CDX schema
+);
+
+CREATE TABLE IF NOT EXISTS ingest_request (
+ link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
+ link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+ ingest_request_source TEXT CHECK (octet_length(ingest_request_source) >= 1),
+ created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ release_stage TEXT CHECK (octet_length(release_stage) >= 1),
+ request JSONB,
+ -- request isn't required, but can stash extra fields there for import, eg:
+ -- ext_ids (source/source_id sometimes enough)
+ -- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
+ -- edit_extra
+ -- ingest type can be: pdf, xml, html
+
+ PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
+);
+CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
+CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created);
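+
+-- Example (hypothetical values) of the sort of JSON that might be stashed in
+-- the `request` column for a DOI-sourced request:
+--
+--   {"ext_ids": {"doi": "10.1234/example-doi"}, "edit_extra": {"note": "example"}}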
+
+CREATE TABLE IF NOT EXISTS ingest_file_result (
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ hit BOOLEAN NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+ terminal_url TEXT CHECK (octet_length(terminal_url) >= 1),
+ terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14),
+ terminal_status_code INT,
+ terminal_sha1hex TEXT CHECK (octet_length(terminal_sha1hex) = 40),
+
+ PRIMARY KEY (ingest_type, base_url)
+);
+CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url);
+CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex);
+
+CREATE TABLE IF NOT EXISTS ingest_fileset_platform (
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ hit BOOLEAN NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+
+ platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1),
+ platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1),
+ platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1),
+ ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1),
+ total_size BIGINT,
+ file_count BIGINT,
+ archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1),
+
+ archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1),
+ web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1),
+ web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14),
+
+ manifest JSONB,
+ -- list, similar to fatcat fileset manifest, plus extra:
+ -- status (str)
+ -- path (str)
+ -- size (int)
+ -- md5 (str)
+ -- sha1 (str)
+ -- sha256 (str)
+ -- mimetype (str)
+ -- extra (dict)
+ -- platform_url (str)
+ -- terminal_url (str)
+ -- terminal_dt (str)
+
+ PRIMARY KEY (ingest_type, base_url)
+);
+CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id);
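+
+-- Example (hypothetical values) of a single entry in the `manifest` list:
+--
+--   {"path": "data/file_01.csv", "size": 12345, "md5": "<md5hex>", "sha1": "<sha1hex>",
+--    "sha256": "<sha256hex>", "mimetype": "text/csv", "status": "success",
+--    "platform_url": "https://example.org/download/file_01.csv",
+--    "terminal_url": "https://example.org/download/file_01.csv",
+--    "terminal_dt": "20200101120000"}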
+
+CREATE TABLE IF NOT EXISTS shadow (
+ shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
+ shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ doi TEXT CHECK (octet_length(doi) >= 1),
+ pmid TEXT CHECK (octet_length(pmid) >= 1),
+ isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
+ PRIMARY KEY(shadow_corpus, shadow_id)
+);
+CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex);
+
+CREATE TABLE IF NOT EXISTS crossref (
+ doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
+ indexed TIMESTAMP WITH TIME ZONE NOT NULL,
+ record JSON NOT NULL,
+ PRIMARY KEY(doi)
+);
+
+CREATE TABLE IF NOT EXISTS grobid_refs (
+ source TEXT NOT NULL CHECK (octet_length(source) >= 1),
+ source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1),
+ source_ts TIMESTAMP WITH TIME ZONE,
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ refs_json JSON NOT NULL,
+ PRIMARY KEY(source, source_id)
+);
+
+CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS
+ SELECT
+ crossref.doi as doi,
+ crossref.indexed as indexed,
+ crossref.record as record,
+ grobid_refs.source_ts as source_ts,
+ grobid_refs.refs_json as refs_json
+ FROM crossref
+ LEFT JOIN grobid_refs ON
+ grobid_refs.source_id = crossref.doi
+ AND grobid_refs.source = 'crossref';
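+
+-- Example usage of the view (hypothetical DOI):
+--
+--   SELECT doi, indexed, refs_json
+--   FROM crossref_with_refs
+--   WHERE doi = '10.1234/example-doi';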
diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md
new file mode 100644
index 0000000..0859e79
--- /dev/null
+++ b/sql/monitoring_queries.md
@@ -0,0 +1,202 @@
+
+## fatcat-changelog pipeline
+
+Overall ingest status, past 30 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+Broken domains, past 30 days:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+Summary of significant domains and status, past 7 days:
+
+ SELECT domain, status, count
+ FROM (
+ SELECT domain, status, COUNT((domain, status)) as count
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.updated >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY CUBE (domain, status)
+ ) t2
+ WHERE count > 200
+ ORDER BY domain ASC , count DESC;
+
+Summary of DOI prefix and status, past 7 days:
+
+ SELECT doi_prefix, status, count
+ FROM (
+ SELECT doi_prefix, status, COUNT((doi_prefix, status)) as count
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_request.link_source_id FROM '(10\.[^/]*)/.*') AS doi_prefix
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.updated >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_request.link_source = 'doi'
+ ) t1
+ WHERE t1.doi_prefix != ''
+ GROUP BY CUBE (doi_prefix, status)
+ ) t2
+ WHERE count > 200
+ ORDER BY doi_prefix ASC , count DESC;
+
+
+Throughput per day, and success, for past 30 days:
+
+ SELECT ingest_request.ingest_type,
+ date(ingest_request.created),
+ COUNT(*) as total,
+ COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created)
+ ORDER BY date(ingest_request.created) DESC;
+
+## fatcat-ingest
+
+Broken domains, past 24 hours:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '7 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+Throughput per day, and success, for requests updated in the past 24 hours:
+
+ SELECT ingest_request.ingest_type,
+ date(ingest_file_result.updated),
+ COUNT(*) as total,
+ COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '7 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated)
+ ORDER BY date(ingest_file_result.updated) DESC;
+
+Overall status, updated requests past 48 hours:
+
+ SELECT ingest_request.ingest_type,
+ ingest_file_result.status,
+ COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_file_result.updated >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '48 hour'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ GROUP BY ingest_request.ingest_type, ingest_file_result.status
+ ORDER BY COUNT(*) DESC;
+
+## savepapernow and fatcat-ingest recent status
+
+Specific recent ingests (for debugging):
+
+ -- for record layout: \x
+ SELECT
+ ingest_file_result.status as status,
+ ingest_request.ingest_type as ingest_type,
+ ingest_request.ingest_request_source as source,
+ ingest_request.link_source_id as source_id,
+ ingest_request.base_url as base_url,
+ ingest_file_result.terminal_dt as dt,
+ ingest_file_result.terminal_status_code as status_code,
+ ingest_file_result.terminal_sha1hex as sha1hex,
+ grobid.status as grobid_status
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid
+ ON ingest_file_result.terminal_sha1hex = grobid.sha1hex
+ WHERE
+ ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
+ -- AND ingest_request.ingest_type = 'pdf'
+ -- AND ingest_request.ingest_type = 'html'
+ AND (
+ ingest_request.ingest_request_source = 'savepapernow-web'
+ -- OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ )
+ ORDER BY ingest_file_result.updated DESC
+ LIMIT 100;
+
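+To run any of these interactively, one way (assuming local access to the
+`sandcrawler` database, as used by the reingest helper scripts) is:
+
+    sudo -u postgres psql sandcrawler
+
+then paste a query, optionally running `\x` first for expanded record layout.
+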
diff --git a/sql/pdftrio_queries.md b/sql/pdftrio_queries.md
new file mode 100644
index 0000000..06f718c
--- /dev/null
+++ b/sql/pdftrio_queries.md
@@ -0,0 +1,65 @@
+
+## Counts / Status
+
+ SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code;
+
+ # NOTE: I earlier deleted a large fraction of non-200 status codes, so
+ # these aren't representative
+ status_code | count
+ -------------+---------
+ -4 | 16
+ -2 | 26
+ 200 | 1117501
+ 400 | 2695
+ (4 rows)
+
+
+ SELECT status, COUNT(*) FROM pdftrio GROUP BY status;
+
+ status | count
+ ---------------+---------
+ error | 2696
+ error-connect | 26
+ error-timeout | 16
+ success | 1118252
+ (4 rows)
+
+ SELECT
+ COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count,
+ COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count,
+ COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count,
+ COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count
+ FROM pdftrio;
+
+
+ ensemble_count | linear_count | bert_count | image_count
+ ----------------+--------------+------------+-------------
+ 1120100 | 976271 | 66209 | 143829
+ (1 row)
+
+## Histograms
+
+ SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND ensemble_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
+ SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND bert_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
+ SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND linear_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
+ SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND image_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
diff --git a/sql/random_queries.md b/sql/random_queries.md
new file mode 100644
index 0000000..572b4f9
--- /dev/null
+++ b/sql/random_queries.md
@@ -0,0 +1,193 @@
+
+Basic stats (2019-09-23):
+
+ SELECT COUNT(*) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex);
+ => 28,023,760
+ => Time: 253897.213 ms (04:13.897)
+
+ SELECT COUNT(DISTINCT sha1hex) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex);
+ => 22,816,087
+ => Time: 287097.944 ms (04:47.098)
+
+    SELECT COUNT(*) FROM grobid;
+ => 56,196,992
+
+ SELECT COUNT(DISTINCT sha1hex) FROM cdx;
+ => 64,348,277
+ => Time: 572383.931 ms (09:32.384)
+
+ SELECT COUNT(*) FROM cdx;
+ => 74,796,777
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC;
+ => Time: 189067.335 ms (03:09.067)
+
+ mimetype | count
+ ------------------------+----------
+ application/pdf | 51049905
+ text/html | 24841846
+ text/xml | 524682
+ application/postscript | 81009
+ (4 rows)
+
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY count(*) DESC;
+
+ status_code | count
+ -------------+----------
+ 200 | 56196992
+
+ compare with older sandcrawler/output-prod/2019-05-28-1920.35-statuscodecount:
+
+ 200 49567139
+ 400 3464503
+ 409 691917
+ 500 247028
+ 503 123
+
+ SELECT row_to_json(cdx) FROM cdx LIMIT 5;
+
+ SELECT row_to_json(r) FROM (
+ SELECT url, datetime FROM cdx
+ ) r
+ LIMIT 5;
+
+More stats (2019-12-27):
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 20;
+
+ SELECT SUM(size_bytes) FROM file_meta;
+
+"Last 24 hour progress":
+
+ # "problem domains" and statuses
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+ # "what type of errors"
+ SELECT ingest_type, status, COUNT(*)
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY ingest_type, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ # "throughput per day for last N days"
+ SELECT ingest_type,
+ date(updated),
+ COUNT(*) as total,
+ COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 month'::INTERVAL
+ GROUP BY ingest_type, date(updated)
+ ORDER BY date(updated) DESC;
+
+## Parse URLs
+
+One approach is to do regexes, something like:
+
+ SELECT substring(column_name FROM '[^/]+://([^/]+)/') AS domain_name FROM table_name;
+
+Eg:
+
+ SELECT DISTINCT(domain), COUNT(domain)
+ FROM (select substring(base_url FROM '[^/]+://([^/]*)') as domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+Or:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+Can also do some quick lookups for a specific domain and protocol like:
+
+ SELECT *
+ FROM ingest_file_result
+ WHERE terminal_url LIKE 'https://insights.ovid.com/%'
+ LIMIT 10;
+
+For a given DOI prefix:
+
+ SELECT *
+ FROM ingest_file_result
+ WHERE base_url LIKE 'https://doi.org/10.17223/a%'
+ AND status = 'no-pdf-link'
+ LIMIT 10;
+
+ SELECT status, count(*)
+ FROM ingest_file_result
+ WHERE base_url LIKE 'https://doi.org/10.17223/%'
+ GROUP BY status
+ ORDER BY count(*) DESC;
+
+## Bulk Ingest
+
+Show bulk ingest status on links *added* in the past 30 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+Top *successful* domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+Summarize non-success domains for the same:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 20;
diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh
new file mode 100755
index 0000000..d39a171
--- /dev/null
+++ b/sql/reingest_bulk.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_bulk_current.json
+
+cat /srv/sandcrawler/tasks/reingest_bulk_current.json \
+ | shuf \
+ | head -n1000000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/sql/reingest_old.sh b/sql/reingest_old.sh
new file mode 100755
index 0000000..96e5416
--- /dev/null
+++ b/sql/reingest_old.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_old.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_old_current.json
+
+cat /srv/sandcrawler/tasks/reingest_old_current.json \
+ | shuf \
+ | head -n1000000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
new file mode 100755
index 0000000..8a2996c
--- /dev/null
+++ b/sql/reingest_quarterly.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_quarterly_current.json
+
+cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \
+ | shuf \
+ | head -n120000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh
new file mode 100755
index 0000000..c693a64
--- /dev/null
+++ b/sql/reingest_spn.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_spn.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn.rows.json \
+ > /srv/sandcrawler/tasks/reingest_spn.json
+
+cat /srv/sandcrawler/tasks/reingest_spn.json \
+ | shuf \
+ | head -n60000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
diff --git a/sql/reingest_terminalstatus_forcerecrawl.sh b/sql/reingest_terminalstatus_forcerecrawl.sh
new file mode 100755
index 0000000..5cb6d51
--- /dev/null
+++ b/sql/reingest_terminalstatus_forcerecrawl.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json
+
+cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \
+ | shuf \
+ | head -n100000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
new file mode 100755
index 0000000..d2e2444
--- /dev/null
+++ b/sql/reingest_weekly.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_weekly_current.json
+
+cat /srv/sandcrawler/tasks/reingest_weekly_current.json \
+ | shuf \
+ | head -n80000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/sandcrawler_schema.sql b/sql/sandcrawler_schema.sql
index fd921ed..a3756d4 100644..120000
--- a/sql/sandcrawler_schema.sql
+++ b/sql/sandcrawler_schema.sql
@@ -1,59 +1 @@
-
-CREATE TABLE IF NOT EXISTS cdx (
- url TEXT NOT NULL CHECK (octet_length(url) >= 1),
- datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
- sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
- cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
- mimetype TEXT CHECK (octet_length(mimetype) >= 1),
- warc_path TEXT CHECK (octet_length(warc_path) >= 1),
- warc_csize BIGINT,
- warc_offset BIGINT,
- row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
- PRIMARY KEY(url, datetime)
-);
-CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
-CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
-
-CREATE TABLE IF NOT EXISTS file_meta (
- sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
- sha256hex TEXT CHECK (octet_length(sha256hex) = 64),
- md5hex TEXT CHECK (octet_length(md5hex) = 32),
- size_bytes BIGINT,
- mimetype TEXT CHECK (octet_length(mimetype) >= 1)
-);
-
-CREATE TABLE IF NOT EXISTS fatcat_file (
- sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
- file_ident TEXT CHECK (octet_length(file_ident) = 26),
- first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
-);
-
-CREATE TABLE IF NOT EXISTS petabox (
- item TEXT NOT NULL CHECK (octet_length(item) >= 1),
- path TEXT NOT NULL CHECK (octet_length(path) >= 1),
- sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
- PRIMARY KEY(item, path)
-);
-CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);
-
-CREATE TABLE IF NOT EXISTS grobid (
- sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
- updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
- grobid_version TEXT CHECK (octet_length(grobid_version) >= 1),
- status_code INT NOT NULL,
- status TEXT CHECK (octet_length(status) >= 1),
- fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),
- metadata JSONB
-);
--- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
-
-CREATE TABLE IF NOT EXISTS shadow (
- shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
- shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
- sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
- doi TEXT CHECK (octet_length(doi) >= 1),
- pmid TEXT CHECK (octet_length(pmid) >= 1),
- isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
- PRIMARY KEY(shadow_corpus, shadow_id)
-);
-CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex);
+migrations/2019-12-19-060141_init/up.sql \ No newline at end of file
diff --git a/sql/stats/2020-01-13_stats.txt b/sql/stats/2020-01-13_stats.txt
new file mode 100644
index 0000000..444e448
--- /dev/null
+++ b/sql/stats/2020-01-13_stats.txt
@@ -0,0 +1,190 @@
+
+## SQL Table Sizes
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 36 GB | 78 GB
+ "public"."grobid" | 38 GB | 7076 MB | 45 GB
+ "public"."file_meta" | 23 GB | 11 GB | 34 GB
+ "public"."shadow" | 8303 MB | 9216 MB | 17 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."ingest_file_result" | 566 MB | 749 MB | 1314 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ "public"."ingest_request" | 363 MB | 625 MB | 988 MB
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+
+ total_count | total_size
+ -------------+-----------------
+ 118823340 | 140917467253923
+ (1 row)
+
+ # 118,823,340 => 118 million
+ # 140,917,467,253,923 => ~141 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 117185567
+ | 1509149
+ application/octet-stream | 87783
+ text/html | 9901
+ application/postscript | 3781
+ application/vnd.ms-powerpoint | 1421
+ text/plain | 1151
+ application/xml | 427
+ application/gzip | 414
+ application/msword | 314
+ (10 rows)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 96141851 | 110030179
+ (1 row)
+
+ # 96,141,851
+ # 110,030,179
+
+Top mimetypes (not unique by sha1):
+
+ mimetype | count
+ ------------------------+----------
+ application/pdf | 84582642
+ text/html | 24841846
+ text/xml | 524682
+ application/postscript | 81009
+ (4 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+ unique_releases | total
+ -----------------+----------
+ 13675190 | 59919772
+
+ # 13,675,190
+ # 59,919,772
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 57382904
+ 500 | 2536862
+ 503 | 6
+ (3 rows)
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 41699385
+ | 15683279
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+ # 2,868,825
+ # 2,887,834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 2816171
+ pdf | arxiv | 154448
+ pdf | spn | 55
+ pdf | pubmed | 2
+ (4 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+-------
+ (0 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-------------+----------+--------+----------
+ pdf | doi | 2816171 | 289199 | 0.103
+ pdf | arxiv | 154448 | 41105 | 0.266
+ pdf | spn | 55 | 46 | 0.836
+ pdf | pubmed | 2 | 0 | 0.000
+ (4 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+---------------------+---------
+ pdf | no-pdf-link | 2213720
+ pdf | success | 330492
+ pdf | spn-remote-error | 182157
+ pdf | spn-error | 141222
+ pdf | cdx-error | 83131
+ pdf | link-loop | 11350
+ pdf | other-mimetype | 6089
+ pdf | null-body | 1980
+ pdf | terminal-bad-status | 583
+ pdf | wayback-error | 381
+ (10 rows)
+
diff --git a/sql/stats/2020-01-31_supplement.txt b/sql/stats/2020-01-31_supplement.txt
new file mode 100644
index 0000000..6bd43ea
--- /dev/null
+++ b/sql/stats/2020-01-31_supplement.txt
@@ -0,0 +1,42 @@
+
+How many file_meta still missing core metadata?
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+ => 1,130,915
+
+Great! Not many.
+
+And how many of those are in petabox?
+
+ SELECT COUNT(*)
+ FROM file_meta
+ LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex
+ WHERE file_meta.sha256hex IS NULL
+ AND file_meta.sha1hex IS NOT NULL;
+ => 1,149,194
+
+Almost all of them; probably just some CDX fetch failures or similar mixed in.
+So these should be run on, eg, grobid2-vm.
+
+ COPY (
+ SELECT row_to_json(petabox.*)
+ FROM file_meta
+ LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex
+ WHERE file_meta.sha256hex IS NULL
+ AND file_meta.sha1hex IS NOT NULL
+ ) TO '/grande/snapshots/dump_grobid_petabox_todo.json';
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but which have no corresponding file in `fatcat_file` (note:
+`fatcat_file` is out of date by a couple million files):
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | count
+ -------------+---------
+ 5072452 | 4130405
+
diff --git a/sql/stats/2020-02-24_stats.txt b/sql/stats/2020-02-24_stats.txt
new file mode 100644
index 0000000..e7a00e8
--- /dev/null
+++ b/sql/stats/2020-02-24_stats.txt
@@ -0,0 +1,482 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ Size: 271.83G
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 36 GB | 78 GB
+ "public"."grobid_shadow" | 61 GB | 6553 MB | 68 GB
+ "public"."grobid" | 47 GB | 7213 MB | 54 GB
+ "public"."file_meta" | 26 GB | 12 GB | 38 GB
+ "public"."shadow" | 8303 MB | 9216 MB | 17 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."ingest_file_result" | 1831 MB | 2454 MB | 4285 MB
+ "public"."ingest_request" | 2006 MB | 2122 MB | 4128 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ "public"."pdftrio" | 78 MB | 64 MB | 142 MB
+ (10 rows)
+
+
+## File Metadata
+
+(skipping, no update)
+
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+Processed or not:
+
+ # TODO:
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+ unique_releases | total
+ -----------------+----------
+ 15,632,810 | 76,555,791
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 70656028
+ 500 | 5896836
+ -4 | 2295
+ 503 | 111
+ (4 rows)
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 56001631
+ | 14654496
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2,868,825 | 2,887,834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 6591633
+ pdf | pmc | 2030279
+ pdf | arxiv | 630743
+ pdf | unpaywall | 1400
+ pdf | spn | 82
+ pdf | pubmed | 2
+ (6 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-------------+-------------------------+---------
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | doi | | 2943896
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | | 629719
+ pdf | doi | fatcat-changelog | 129932
+ pdf | doi | fatcat-ingest | 1935
+ pdf | pmc | | 1454
+ pdf | unpaywall | unpaywall | 1400
+ pdf | arxiv | fatcat-ingest | 998
+ pdf | spn | | 64
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | spn | savepapernow-web | 18
+ pdf | pubmed | | 2
+ pdf | doi | savepapernow-web | 1
+ (14 rows)
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
+ => UPDATE 2943896
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
+ => UPDATE 64
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
+ => UPDATE 629719
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
+ => UPDATE 1454
+
+ SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed';
+ DELETE FROM ingest_request WHERE link_source = 'pubmed';
+ => DELETE 2
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 6591637
+ pdf | pmc | 2030279
+ pdf | arxiv | 630743
+ pdf | unpaywall | 1400
+ pdf | spn | 82
+ (5 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-------------+-------------------------+---------
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | doi | fatcat-changelog | 3073828
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630717
+ pdf | doi | fatcat-ingest | 1935
+ pdf | pmc | fatcat-ingest | 1454
+ pdf | unpaywall | unpaywall | 1400
+ pdf | spn | savepapernow-web | 82
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 1
+ (10 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ none?
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-------------+----------+---------+----------
+ pdf | doi | 6591637 | 1622702 | 0.246
+ pdf | pmc | 2030279 | 1241836 | 0.612
+ pdf | arxiv | 630743 | 500620 | 0.794
+ pdf | unpaywall | 1400 | 851 | 0.608
+ pdf | spn | 82 | 62 | 0.756
+ (5 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+---------
+ pdf | success | 3366189
+ pdf | no-pdf-link | 2902620
+ pdf | no-capture | 1672025
+ pdf | redirect-loop | 388844
+ pdf | cdx-error | 272780
+ pdf | terminal-bad-status | 171878
+ pdf | spn-remote-error | 163843
+ pdf | spn-error | 108070
+ pdf | null-body | 66778
+ pdf | link-loop | 43403
+ pdf | skip-url-blocklist | 34705
+ pdf | wrong-mimetype | 31343
+ pdf | wayback-error | 13012
+ pdf | spn2-cdx-lookup-failure | 6100
+ pdf | gateway-timeout | 5633
+ pdf | other-mimetype | 5114
+ pdf | spn2-error:proxy-error | 538
+ pdf | spn2-error:job-failed | 470
+ pdf | petabox-error | 415
+ pdf | spn2-error:browser-running-error | 136
+ pdf | spn2-error | 127
+ pdf | spn2-error:soft-time-limit-exceeded | 71
+ pdf | bad-redirect | 39
+ pdf | spn2-error:unknown | 30
+ pdf | spn2-error:browsing-timeout | 25
+ pdf | pending | 3
+ pdf | invalid-host-resolution | 1
+ (27 rows)
+
+
+## Fatcat Files
+
+(skipping, no update)
+
+## Recent Success/Failure of Ingest by Domain
+
+NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a
+small fraction of this is from changelog.
+
+ # "problem domains" and statuses
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+ domain | status | count
+ -------------------------+----------------+-------
+ linkinghub.elsevier.com | no-capture | 2579
+ www.mdpi.com | wrong-mimetype | 1313
+ onlinelibrary.wiley.com | no-pdf-link | 785
+ americanarchivist.org | no-pdf-link | 756
+ journals.sagepub.com | redirect-loop | 503
+ link.springer.com | redirect-loop | 432
+ iopscience.iop.org | no-capture | 392
+ www.tandfonline.com | no-pdf-link | 389
+ pubs.rsc.org | no-capture | 361
+ www.persee.fr | no-capture | 344
+ (10 rows)
+
+
+ # "what type of errors"
+ SELECT ingest_type, status, COUNT(*)
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY ingest_type, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+-------
+ pdf | success | 40578
+ pdf | cdx-error | 14982
+ pdf | no-capture | 7747
+ pdf | no-pdf-link | 7111
+ pdf | redirect-loop | 3265
+ pdf | wrong-mimetype | 1629
+ pdf | spn2-cdx-lookup-failure | 657
+ pdf | link-loop | 538
+ pdf | null-body | 517
+ pdf | terminal-bad-status | 400
+ pdf | wayback-error | 79
+ pdf | spn2-error:job-failed | 53
+ pdf | gateway-timeout | 38
+ pdf | spn2-error:soft-time-limit-exceeded | 7
+ pdf | spn2-error | 6
+ pdf | petabox-error | 5
+ pdf | spn2-error:browsing-timeout | 4
+ pdf | spn2-error:unknown | 2
+ pdf | bad-redirect | 1
+ pdf | pending | 1
+ (20 rows)
+
+ # "throughput per day for last N days"
+ SELECT ingest_type,
+ date(updated),
+ COUNT(*) as total,
+ COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 month'::INTERVAL
+ GROUP BY ingest_type, date(updated)
+ ORDER BY date(updated) DESC;
+
+ ingest_type | date | total | success
+ -------------+------------+---------+---------
+ pdf | 2020-02-25 | 32660 | 14322
+ pdf | 2020-02-24 | 44967 | 26263
+ pdf | 2020-02-23 | 58795 | 18874
+ pdf | 2020-02-22 | 844249 | 272606
+ pdf | 2020-02-21 | 1287378 | 433487
+ pdf | 2020-02-20 | 1455943 | 492408
+ pdf | 2020-02-19 | 21453 | 7529
+ pdf | 2020-02-18 | 5863 | 2926
+ pdf | 2020-02-17 | 3737 | 970
+ pdf | 2020-02-16 | 13779 | 4862
+ pdf | 2020-02-15 | 1021020 | 623020
+ pdf | 2020-02-14 | 1036036 | 632830
+ pdf | 2020-02-13 | 13503 | 5824
+ pdf | 2020-02-12 | 20078 | 11422
+ pdf | 2020-02-11 | 13499 | 6781
+ pdf | 2020-02-10 | 2275 | 961
+ pdf | 2020-02-09 | 3231 | 1494
+ pdf | 2020-02-08 | 8967 | 4400
+ pdf | 2020-02-07 | 7022 | 2430
+ pdf | 2020-02-06 | 1291 | 516
+ pdf | 2020-02-05 | 8586 | 6596
+ pdf | 2020-02-04 | 3681 | 3593
+ pdf | 2020-02-03 | 284 | 284
+ pdf | 2020-02-02 | 480 | 480
+ pdf | 2020-02-01 | 489 | 336
+ pdf | 2020-01-31 | 1187 | 1130
+ pdf | 2020-01-30 | 1613 | 1288
+ pdf | 2020-01-29 | 947 | 279
+ pdf | 2020-01-28 | 667 | 323
+ (29 rows)
+
+Top "no-capture" domains (will need to re-ingest using live tool):
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ --------------------------+------------+--------
+ linkinghub.elsevier.com | no-capture | 320065
+ iopscience.iop.org | no-capture | 46858
+ pubs.rsc.org | no-capture | 43331
+ www.persee.fr | no-capture | 38971
+ www.doiserbia.nb.rs | no-capture | 27112
+ academic.oup.com | no-capture | 18877
+ www.osapublishing.org | no-capture | 17113
+ osf.io | no-capture | 16978
+ scripts.iucr.org | no-capture | 14844
+ www.degruyter.com | no-capture | 8093
+ mab-online.nl | no-capture | 6603
+ insights.ovid.com | no-capture | 6457
+ ir.lib.uth.gr | no-capture | 3625
+ www.sciencedirect.com | no-capture | 3244
+ www.tandfonline.com | no-capture | 3201
+ www.ccsenet.org | no-capture | 2849
+ www.intechopen.com | no-capture | 2813
+ primary-hospital-care.ch | no-capture | 2774
+ www.nature.com | no-capture | 2484
+ www.indianjournals.com | no-capture | 2432
+ journals.aps.org | no-capture | 2197
+ journals.sagepub.com | no-capture | 2064
+ www.episodes.org | no-capture | 1805
+ periodicos.uninove.br | no-capture | 1692
+ escholarship.org | no-capture | 1666
+ (25 rows)
+
+Top "no-pdf-link" domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-pdf-link'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ -----------------------------+-------------+--------
+ plutof.ut.ee | no-pdf-link | 685315
+ www.gbif.org | no-pdf-link | 670647
+ doi.pangaea.de | no-pdf-link | 301984
+ www.plate-archive.org | no-pdf-link | 209218
+ onlinelibrary.wiley.com | no-pdf-link | 84890
+ figshare.com | no-pdf-link | 72892
+ zenodo.org | no-pdf-link | 45768
+ www.tandfonline.com | no-pdf-link | 43848
+ data.mendeley.com | no-pdf-link | 42367
+ springernature.figshare.com | no-pdf-link | 35941
+ dhz.uni-passau.de | no-pdf-link | 29187
+ www.frontiersin.org | no-pdf-link | 17925
+ digital.ucd.ie | no-pdf-link | 16769
+ mr.crossref.org | no-pdf-link | 14999
+ journals.lww.com | no-pdf-link | 12122
+ musewide.aip.de | no-pdf-link | 10854
+ datadryad.org | no-pdf-link | 10686
+ www.jstor.org | no-pdf-link | 9159
+ koreascience.or.kr | no-pdf-link | 9067
+ easy.dans.knaw.nl | no-pdf-link | 8264
+ scielo.conicyt.cl | no-pdf-link | 8069
+ www.degruyter.com | no-pdf-link | 7989
+ www.kci.go.kr | no-pdf-link | 6990
+ www.m-hikari.com | no-pdf-link | 6941
+ cshprotocols.cshlp.org | no-pdf-link | 6553
+ (25 rows)
+
+Top block-ish domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status')
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ ---------------------------------+---------------------+-------
+ journals.openedition.org | redirect-loop | 30395
+ ieeexplore.ieee.org | redirect-loop | 28926
+ www.degruyter.com | redirect-loop | 18891
+ www.cairn.info | link-loop | 8919
+ www.frontiersin.org | terminal-bad-status | 6786
+ projecteuclid.org | link-loop | 6098
+ www.mdpi.com | terminal-bad-status | 5189
+ medicalforum.ch | terminal-bad-status | 4596
+ jrnl.nau.edu.ua | link-loop | 4238
+ www.revistas.unam.mx | link-loop | 3926
+ journals.aps.org | redirect-loop | 3696
+ www.ijcseonline.org | redirect-loop | 3567
+ www.researchsquare.com | terminal-bad-status | 3453
+ www.persee.fr | terminal-bad-status | 3221
+ www.baltistica.lt | link-loop | 2098
+ osf.io | redirect-loop | 2004
+ seer.ufrgs.br | terminal-bad-status | 2002
+ jtd.amegroups.com | link-loop | 1738
+ www.hindawi.com | terminal-bad-status | 1613
+ linkinghub.elsevier.com | redirect-loop | 1612
+ www.scienceopen.com | terminal-bad-status | 1580
+ atm.amegroups.com | link-loop | 1571
+ scielo.conicyt.cl | terminal-bad-status | 1491
+ repozytorium.ur.edu.pl | redirect-loop | 1279
+ agupubs.onlinelibrary.wiley.com | link-loop | 1182
+ (25 rows)
+
diff --git a/sql/stats/2020-05-03_stats.txt b/sql/stats/2020-05-03_stats.txt
new file mode 100644
index 0000000..55f0c1e
--- /dev/null
+++ b/sql/stats/2020-05-03_stats.txt
@@ -0,0 +1,418 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 41 GB | 82 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."grobid" | 59 GB | 7604 MB | 66 GB
+ "public"."file_meta" | 31 GB | 28 GB | 59 GB
+ "public"."ingest_request" | 19 GB | 20 GB | 39 GB
+ "public"."ingest_file_result" | 15 GB | 23 GB | 39 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (10 rows)
+
+ Size: 383.93G
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 158059828 | 197346217653010
+ (1 row)
+
+ => 158 million, 197 terabytes
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 157805029
+ application/octet-stream | 154348
+ application/xml | 42170
+ text/html | 18703
+ text/plain | 15989
+ application/gzip | 6484
+ | 6040
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ ---------
+ 1027125
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+
+ unique_sha1 | total
+ -------------+-----------
+ 92936564 | 111022039
+ (1 row)
+
+    => 111 million rows, 92.9 million unique files
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------------------------------------------------------------+-----------
+ application/pdf | 104178718
+ warc/revisit | 5274410
+ text/xml | 519042
+ text/html | 295523
+ application/octet-stream | 259681
+ unk | 138930
+ application/postscript | 81065
+ application/save | 80765
+ binary/octet-stream | 59804
+ application/x-download | 27083
+ text/plain | 26938
+ application/download | 25125
+ image/pdf | 16095
+ application/force-download | 9004
+ application/x-msdownload | 3711
+ application | 2934
+ application/x-octetstream | 2926
+ multipart/form-data | 2741
+ application/x-pdf | 2444
+ .pdf | 2368
+ application/binary | 1268
+ application/pdf' | 1192
+ pdf | 1113
+ file/unknown | 1086
+ application/unknown | 761
+ file | 753
+ application/blob | 670
+ application/octetstream | 657
+ text/pdf | 549
+ 0 | 417
+ ('application/pdf', | 349
+ application/http;msgtype=response | 251
+ application/doc | 180
+    [...] (note: output was not actually generated with LIMIT 25 as shown)
+
+Processed or not:
+
+ # TODO:
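+
+A possible query sketch for this (not run; assumes joining `cdx` to `grobid`
+on `sha1hex`, as in the `fatcat_file` query further below):
+
+    SELECT
+        COUNT(DISTINCT cdx.sha1hex) AS total_files,
+        COUNT(DISTINCT grobid.sha1hex) AS grobid_processed
+    FROM cdx
+    LEFT JOIN grobid
+        ON cdx.sha1hex = grobid.sha1hex;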
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+
+ unique_releases | total
+ -----------------+----------
+ 17455441 | 92707544
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 71057023
+ | 14638425
+ (2 rows)
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status = 'success' GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 71057074
+ | 3
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | unpaywall | 26244088
+ pdf | mag | 25596658
+ pdf | doi | 15652966
+ pdf | pmc | 2043646
+ pdf | arxiv | 721902
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 103
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | unpaywall | unpaywall | 26244088
+ pdf | mag | mag-corpus | 25596658
+ pdf | doi | fatcat-ingest | 8267308
+ pdf | doi | fatcat-changelog | 3869772
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630719
+ pdf | arxiv | fatcat-changelog | 91157
+ pdf | pmc | fatcat-ingest | 10195
+ pdf | pmc | fatcat-changelog | 4626
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 103
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 15
+ (15 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+-------
+ pdf | mag | 47
+ pdf | unpaywall | 1
+ (2 rows)
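+
+To address the TODO above, the LEFT JOIN could be cross-checked against an
+equivalent anti-join formulation; a sketch (not run), which should return the
+same counts:
+
+    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+    FROM ingest_request
+    WHERE NOT EXISTS (
+        SELECT 1 FROM ingest_file_result
+        WHERE ingest_file_result.base_url = ingest_request.base_url
+        AND ingest_file_result.ingest_type = ingest_request.ingest_type
+    )
+    GROUP BY ingest_request.ingest_type, ingest_request.link_source
+    ORDER BY COUNT(*) DESC
+    LIMIT 25;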
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | unpaywall | 26244088 | 19968092 | 0.761
+ pdf | mag | 25596658 | 18712912 | 0.731
+ pdf | doi | 15653166 | 2878833 | 0.184
+ pdf | pmc | 2043646 | 1279529 | 0.626
+ pdf | arxiv | 721902 | 592394 | 0.821
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 103 | 82 | 0.796
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 37449502
+ pdf | no-pdf-link | 10908442
+ pdf | no-capture | 5643670
+ pdf | redirect-loop | 4823502
+ pdf | terminal-bad-status | 1715056
+ pdf | link-loop | 1425072
+ pdf | cdx-error | 535365
+ pdf | gateway-timeout | 267654
+ pdf | skip-url-blocklist | 220433
+ pdf | wrong-mimetype | 189804
+ pdf | spn2-cdx-lookup-failure | 103926
+ pdf | spn-error | 101777
+ pdf | wayback-error | 93517
+ pdf | null-body | 87279
+ pdf | invalid-host-resolution | 35305
+ pdf | spn-remote-error | 28888
+ pdf | petabox-error | 12406
+ pdf | spn2-error | 2905
+ pdf | spn2-error:job-failed | 2307
+ pdf | other-mimetype | 2305
+ pdf | redirects-exceeded | 745
+ pdf | spn2-error:proxy-error | 438
+ pdf | spn2-error:invalid-url-syntax | 406
+ pdf | spn2-error:soft-time-limit-exceeded | 405
+ pdf | spn2-error:browser-running-error | 274
+ (25 rows)
+
+Failures by domain:
+
+ SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type as ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY ingest_type, domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | domain | status | count
+ -------------+---------------------------------------+---------------------+--------
+ pdf | ssl.fao.org | no-pdf-link | 862277
+ pdf | www.researchgate.net | redirect-loop | 749094
+ pdf | www.e-periodica.ch | no-pdf-link | 747370
+ pdf | ieeexplore.ieee.org | redirect-loop | 707482
+ pdf | plutof.ut.ee | no-pdf-link | 685341
+ pdf | www.gbif.org | no-pdf-link | 670905
+ pdf | dlc.library.columbia.edu | no-pdf-link | 508281
+ pdf | figshare.com | no-pdf-link | 400501
+ pdf | onlinelibrary.wiley.com | no-pdf-link | 399187
+ pdf | watermark.silverchair.com | terminal-bad-status | 357188
+ pdf | www.die-bonn.de | redirect-loop | 352903
+ pdf | academic.oup.com | no-pdf-link | 346828
+ pdf | iopscience.iop.org | terminal-bad-status | 345147
+ pdf | linkinghub.elsevier.com | no-capture | 328434
+ pdf | statisticaldatasets.data-planet.com | no-pdf-link | 312206
+ pdf | cyberleninka.ru | link-loop | 309525
+ pdf | www.tandfonline.com | no-pdf-link | 309146
+ pdf | dialnet.unirioja.es | terminal-bad-status | 307572
+ pdf | doi.pangaea.de | no-pdf-link | 304924
+ pdf | journals.sagepub.com | no-pdf-link | 285774
+ pdf | papers.ssrn.com | link-loop | 282415
+ pdf | dialnet.unirioja.es | redirect-loop | 274476
+ pdf | ieeexplore.ieee.org | link-loop | 273607
+ pdf | catalog.paradisec.org.au | redirect-loop | 234653
+ pdf | www.plate-archive.org | no-pdf-link | 209217
+ pdf | zenodo.org | no-pdf-link | 200078
+ pdf | zenodo.org | no-capture | 199025
+ pdf | spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 187084
+ pdf | digi.ub.uni-heidelberg.de | no-pdf-link | 187039
+ pdf | validate.perfdrive.com | no-pdf-link | 180191
+ (30 rows)
+
+Success by domain:
+
+ SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type as ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY ingest_type, domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | domain | status | count
+ -------------+----------------------------+---------+---------
+ pdf | www.jstage.jst.go.jp | success | 2244620
+ pdf | europepmc.org | success | 1284770
+ pdf | link.springer.com | success | 1017998
+ pdf | www.scielo.br | success | 799577
+ pdf | arxiv.org | success | 592622
+ pdf | downloads.hindawi.com | success | 527278
+ pdf | res.mdpi.com | success | 501093
+ pdf | hal.archives-ouvertes.fr | success | 447877
+ pdf | digital.library.unt.edu | success | 404460
+ pdf | www.cambridge.org | success | 394666
+ pdf | dergipark.org.tr | success | 373706
+ pdf | journals.plos.org | success | 296994
+ pdf | watermark.silverchair.com | success | 275562
+ pdf | www.nature.com | success | 263836
+ pdf | cds.cern.ch | success | 223057
+ pdf | www.pnas.org | success | 220488
+ pdf | s3-eu-west-1.amazonaws.com | success | 214558
+ pdf | www.jbc.org | success | 205277
+ pdf | www.redalyc.org | success | 193591
+ pdf | iopscience.iop.org | success | 175796
+ pdf | apps.dtic.mil | success | 170589
+ pdf | zenodo.org | success | 167812
+ pdf | peerj.com | success | 155620
+ pdf | www.biorxiv.org | success | 149337
+ pdf | 210.101.116.28 | success | 145706
+ pdf | www.teses.usp.br | success | 145438
+ pdf | absimage.aps.org | success | 144400
+ pdf | hrcak.srce.hr | success | 134669
+ pdf | www.erudit.org | success | 131771
+ pdf | babel.hathitrust.org | success | 130645
+ (30 rows)
+
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ => NOT RUN, fatcat_file table is way out of date
+
diff --git a/sql/stats/2020-07-23_stats.txt b/sql/stats/2020-07-23_stats.txt
new file mode 100644
index 0000000..d1993fc
--- /dev/null
+++ b/sql/stats/2020-07-23_stats.txt
@@ -0,0 +1,347 @@
+
+Summary:
+
+- many more PDFs have been processed with GROBID than with pdf_meta
+- about 1 million file_meta rows still have only partial metadata (eg, no sha256)
+- database size is still under 0.5 TByte
+- there are about a million CDX-error ingest requests, and hundreds of
+  thousands of SPN errors, which could be re-run (see the query sketch below)
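+
+A sketch of how those re-runnable requests could be selected (not run; the
+specific statuses listed are assumptions based on the status counts below):
+
+    SELECT ingest_request.*
+    FROM ingest_request
+    JOIN ingest_file_result
+        ON ingest_file_result.base_url = ingest_request.base_url
+        AND ingest_file_result.ingest_type = ingest_request.ingest_type
+    WHERE ingest_file_result.status IN ('cdx-error', 'spn-error', 'spn2-error', 'gateway-timeout');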
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 42 GB | 84 GB
+ "public"."ingest_request" | 34 GB | 39 GB | 73 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."grobid" | 61 GB | 7742 MB | 69 GB
+ "public"."file_meta" | 32 GB | 29 GB | 61 GB
+ "public"."ingest_file_result" | 24 GB | 36 GB | 60 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."pdf_meta" | 8018 MB | 1966 MB | 9984 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (11 rows)
+
+ Size: 466.91G
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 161944425 | 204,402,677,360,189
+ (1 row)
+
+ # 161.9 mil; 204 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 161691608
+ application/octet-stream | 154348
+ application/xml | 42170
+ text/html | 18703
+ text/plain | 15989
+ application/gzip | 6484
+ | 6036
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ ---------
+ 1015337
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 96537611 | 116281981
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------+-----------
+ application/pdf | 108706978
+ warc/revisit | 5912013
+ text/xml | 519042
+ application/octet-stream | 307782
+ text/html | 295634
+ unk | 156937
+ application/postscript | 81079
+ application/save | 80871
+ binary/octet-stream | 61263
+ text/plain | 31495
+ application/x-download | 30511
+ application/download | 26716
+ image/pdf | 26357
+ application/force-download | 10541
+ multipart/form-data | 5551
+ application/x-msdownload | 3724
+ application/x-octetstream | 3216
+ application | 3171
+ .pdf | 2728
+ application/x-pdf | 2563
+ application/binary | 1306
+ application/pdf' | 1192
+ pdf | 1180
+ [...]
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+
+ total_files | unique_releases
+ -------------+-----------------
+ 95557413 | 18020570
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+
+ status_code | count
+ -------------+----------
+ 200 | 88450610
+ 500 | 7101098
+ -4 | 4133
+ 503 | 110
+
+    SELECT status, COUNT(*) FROM grobid GROUP BY status ORDER BY COUNT DESC LIMIT 10;
+
+ status | count
+ ----------------+----------
+ success | 73814297
+ | 14638412
+ error | 7101308
+ error-timeout | 4133
+ bad-grobid-xml | 6
+ (5 rows)
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 73813427
+ | 14638425
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 27653003
+ pdf | doi | 16589669
+ pdf | pmc | 2231113
+ pdf | arxiv | 794693
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 148
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 27653003
+ pdf | doi | fatcat-ingest | 8320832
+ pdf | doi | fatcat-changelog | 4752956
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630750
+ pdf | pmc | fatcat-ingest | 194781
+ pdf | arxiv | fatcat-changelog | 163924
+ pdf | pmc | fatcat-changelog | 7507
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 148
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 19
+ pdf | arxiv | savepapernow-web | 2
+
+Uncrawled requests by source:
+
+ # TODO: verify this? seems wrong
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | mag | 4097008
+ pdf | oai | 15287
+ pdf | unpaywall | 1
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 5346057 | 0.104
+ pdf | mag | 35015357 | 22199271 | 0.634
+ pdf | unpaywall | 27653003 | 22067338 | 0.798
+ pdf | doi | 16589700 | 3207661 | 0.193
+ pdf | pmc | 2231113 | 1696976 | 0.761
+ pdf | arxiv | 794727 | 645607 | 0.812
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 148 | 114 | 0.770
+ (9 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 46465271
+ pdf | no-capture | 46115869
+ pdf | no-pdf-link | 13877460
+ pdf | redirect-loop | 5943956
+ pdf | terminal-bad-status | 1962754
+ pdf | link-loop | 1630078
+ pdf | cdx-error | 1014409
+ pdf | gateway-timeout | 459340
+ pdf | wrong-mimetype | 321774
+ pdf | skip-url-blocklist | 220629
+ pdf | wayback-error | 220453
+ pdf | spn2-cdx-lookup-failure | 143963
+ pdf | null-body | 113384
+ pdf | spn-error | 101773
+ pdf | invalid-host-resolution | 37367
+ pdf | spn-remote-error | 28886
+ pdf | petabox-error | 22997
+ pdf | spn2-error | 16342
+ pdf | spn2-error:job-failed | 5017
+ pdf | other-mimetype | 2305
+ pdf | redirects-exceeded | 746
+ pdf | spn2-error:soft-time-limit-exceeded | 632
+ pdf | spn2-error:proxy-error | 437
+ pdf | spn2-error:invalid-url-syntax | 417
+ pdf | timeout | 417
+ (25 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+
+ total_count | release_count
+ -------------+---------------
+ 5862666 | 4728824
+ (1 row)
+
+## PDF Meta
+
+Total rows:
+
+ SELECT COUNT(*) as total_count FROM pdf_meta;
+
+
+ total_count
+ -------------
+ 21961874
+
+By status:
+
+ SELECT status, COUNT(*) from pdf_meta GROUP BY status ORDER BY COUNT(*) DESC;
+
+ status | count
+ ----------------+----------
+ success | 21788507
+ parse-error | 78196
+ text-too-large | 60595
+ not-pdf | 31679
+ error-wayback | 2639
+ bad-unicode | 251
+ bad-pdf | 6
+ empty-blob | 1
+ (8 rows)
+
diff --git a/sql/stats/2020-09-14_stats.txt b/sql/stats/2020-09-14_stats.txt
new file mode 100644
index 0000000..3bc27b0
--- /dev/null
+++ b/sql/stats/2020-09-14_stats.txt
@@ -0,0 +1,340 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 44 GB | 45 GB | 89 GB
+ "public"."grobid" | 66 GB | 8127 MB | 74 GB
+ "public"."ingest_request" | 34 GB | 40 GB | 73 GB
+ "public"."ingest_file_result" | 28 GB | 44 GB | 72 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."file_meta" | 33 GB | 30 GB | 63 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."pdf_meta" | 12 GB | 2924 MB | 15 GB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (11 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 167021210 | 221982345333674
+ (1 row)
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 166765214
+ application/octet-stream | 155517
+ application/xml | 42170
+ text/html | 18708
+ text/plain | 15990
+ application/gzip | 6491
+ | 6036
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 62960
+ (1 row)
+
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 102123051 | 126550160
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 116885565
+ warc/revisit | 7951816
+ text/xml | 519042
+ application/octet-stream | 327639
+ text/html | 295725
+ unk | 172491
+ application/postscript | 81095
+ application/save | 80900
+ binary/octet-stream | 61783
+ text/plain | 33684
+ image/pdf | 32856
+ application/x-download | 32418
+ application/download | 27672
+ application/force-download | 10892
+ multipart/form-data | 5750
+ application/x-msdownload | 3832
+ application/x-octetstream | 3516
+ application | 3499
+ .pdf | 3038
+ application/x-pdf | 2701
+ application/binary | 1322
+ pdf | 1232
+ file/unknown | 1199
+ application/pdf' | 1192
+ file | 979
+ (25 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+ total_files | unique_releases
+ -------------+-----------------
+ 101494314 | 18919012
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 93730358
+ 500 | 7759103
+ -4 | 4683
+ 503 | 150
+ (4 rows)
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 80838234
+ | 12892145
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 27653003
+ pdf | doi | 17362763
+ pdf | pmc | 2248854
+ pdf | arxiv | 835400
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 197
+ (9 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 27653003
+ pdf | doi | fatcat-ingest | 8399261
+ pdf | doi | fatcat-changelog | 5449349
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 634665
+ pdf | pmc | fatcat-ingest | 210453
+ pdf | arxiv | fatcat-changelog | 200707
+ pdf | pmc | fatcat-changelog | 9582
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 197
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 21
+ pdf | arxiv | savepapernow-web | 2
+ (17 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 170304
+ pdf | oai | 15287
+ pdf | unpaywall | 1
+ (3 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 14144314 | 0.276
+ pdf | mag | 35015357 | 24811947 | 0.709
+ pdf | unpaywall | 27653003 | 22302629 | 0.807
+ pdf | doi | 17363369 | 3533568 | 0.204
+ pdf | pmc | 2248860 | 1713197 | 0.762
+ pdf | arxiv | 835400 | 685219 | 0.820
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 197 | 138 | 0.701
+ (9 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 58265365
+ pdf | no-pdf-link | 27216435
+ pdf | no-capture | 21982611
+ pdf | redirect-loop | 8457469
+ pdf | terminal-bad-status | 2695023
+ pdf | link-loop | 2209672
+ pdf | wrong-mimetype | 767508
+ pdf | gateway-timeout | 548870
+ pdf | cdx-error | 391611
+ pdf | skip-url-blocklist | 220661
+ pdf | null-body | 182215
+ pdf | wayback-error | 146869
+ pdf | spn2-cdx-lookup-failure | 107229
+ pdf | spn-error | 85128
+ pdf | invalid-host-resolution | 37352
+ pdf | petabox-error | 32490
+ pdf | spn2-error | 29212
+ pdf | spn-remote-error | 27927
+ pdf | other-mimetype | 2305
+ pdf | bad-redirect | 1524
+ pdf | spn2-error:job-failed | 1521
+ pdf | timeout | 842
+ pdf | spn2-error:soft-time-limit-exceeded | 793
+ pdf | redirects-exceeded | 748
+ pdf | spn2-error:invalid-url-syntax | 417
+ (25 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*)
+ FROM ingest_file_result
+ WHERE hit = false
+ GROUP BY ingest_type, terminal_status_code
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 34064937
+ pdf | | 20514531
+ pdf | 301 | 7271700
+ pdf | 302 | 720632
+ pdf | 503 | 712697
+ pdf | 400 | 444209
+ pdf | 404 | 331495
+ pdf | 403 | 323030
+ pdf | 401 | 259327
+ pdf | 500 | 236122
+ pdf | 303 | 101609
+ pdf | 429 | 47738
+ pdf | 502 | 36183
+ pdf | 420 | 26603
+ pdf | 509 | 15113
+ pdf | 409 | 14790
+ pdf | 999 | 8996
+ pdf | 307 | 3769
+ pdf | 308 | 3422
+ pdf | 202 | 3228
+ pdf | 520 | 2058
+ pdf | 410 | 1734
+ pdf | 521 | 1033
+ pdf | 504 | 868
+ pdf | 505 | 424
+ (25 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | release_count
+ -------------+---------------
+ 6600758 | 5213294
+ (1 row)
+
diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt
new file mode 100644
index 0000000..fca76b9
--- /dev/null
+++ b/sql/stats/2021-04-07_stats.txt
@@ -0,0 +1,430 @@
+
+## SQL Table Sizes
+
+ Size: 551.34G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 49 GB | 50 GB | 100 GB
+ "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB
+ "public"."ingest_request" | 39 GB | 45 GB | 83 GB
+ "public"."grobid" | 70 GB | 8613 MB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB
+ "public"."file_meta" | 35 GB | 31 GB | 66 GB
+ "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB
+ "public"."shadow" | 9517 MB | 10 GB | 20 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."html_meta" | 1172 MB | 10 MB | 1182 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (12 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 174200807 | 234313766162033
+ (1 row)
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 173816433
+ application/octet-stream | 155534
+ text/html | 115821
+ application/xml | 42170
+ application/xhtml+xml | 24347
+ text/plain | 15990
+ application/jats+xml | 6899
+ application/gzip | 6491
+ | 6034
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ application/x-bzip2 | 891
+ image/jpeg | 721
+ image/gif | 389
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297
+ application/x-compress | 272
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ image/png | 88
+ application/mac-binhex40 | 79
+ application/x-dosexec | 51
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ text/rtf | 33
+ application/x-dvi | 29
+ application/x-rar | 29
+ application/vnd.ms-excel | 28
+ message/rfc822 | 26
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 62271
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 113880640 | 141793694
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 131346703
+ warc/revisit | 8394443
+ text/xml | 525481
+ application/octet-stream | 502400
+ text/html | 417579
+ unk | 186703
+ application/postscript | 81095
+ application/save | 80915
+ binary/octet-stream | 66698
+ application/x-download | 35771
+ text/plain | 35606
+ image/pdf | 33904
+ application/download | 29701
+ application/force-download | 16726
+ multipart/form-data | 6878
+ application/x-msdownload | 3843
+ application | 3724
+ application/x-octetstream | 3550
+ .pdf | 3138
+ application/x-pdf | 2780
+ application/binary | 1332
+ pdf | 1247
+ file/unknown | 1200
+ application/pdf' | 1192
+ file | 1108
+ application/unknown | 978
+ application/octetstream | 856
+ application/blob | 673
+ text/pdf | 672
+ 0 | 546
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+ total_files | unique_releases
+ -------------+-----------------
+ 105594307 | 19594878
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+----------
+ 200 | 97714631
+ 500 | 7875192
+ -4 | 4772
+ 503 | 520
+ (4 rows)
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 84822508
+ | 12892147
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 31772942
+ pdf | doi | 23528817
+ pdf | doaj | 4264610
+ html | doaj | 2429003
+ pdf | pmc | 2277417
+ pdf | arxiv | 2143549
+ xml | doaj | 9442
+ html | doi | 3022
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 469
+ html | spn | 9
+ (14 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 31772942
+ pdf | doi | fatcat-changelog | 11010764
+ pdf | doi | fatcat-ingest | 9002119
+ pdf | doaj | doaj | 4264610
+ pdf | doi | fatcat-ingest-container | 3515873
+ html | doaj | doaj | 2429003
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1767703
+ pdf | arxiv | fatcat-changelog | 375818
+ pdf | pmc | fatcat-ingest | 211264
+ pdf | pmc | fatcat-changelog | 37328
+ xml | doaj | doaj | 9442
+ html | doi | fatcat-ingest | 3018
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 469
+ pdf | doi | savepapernow-web | 74
+ pdf | arxiv | fatcat-ingest-container | 26
+ html | spn | savepapernow-web | 9
+ html | doi | savepapernow-web | 4
+ pdf | arxiv | savepapernow-web | 2
+ (23 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 168462
+ pdf | oai | 15286
+ pdf | doaj | 2068
+ html | doaj | 620
+ pdf | unpaywall | 13
+ (5 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 14163500 | 0.277
+ pdf | mag | 35015357 | 24818176 | 0.709
+ pdf | unpaywall | 31772942 | 25018501 | 0.787
+ pdf | doi | 23529041 | 5773728 | 0.245
+ pdf | doaj | 4264610 | 2851328 | 0.669
+ html | doaj | 2429003 | 122937 | 0.051
+ pdf | pmc | 2277417 | 1736491 | 0.762
+ pdf | arxiv | 2143549 | 2011378 | 0.938
+ xml | doaj | 9442 | 6897 | 0.730
+ html | doi | 3022 | 957 | 0.317
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 469 | 328 | 0.699
+ html | spn | 9 | 2 | 0.222
+ (14 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+--------------------------------+----------
+ pdf | success | 66487928
+ pdf | no-pdf-link | 29279677
+ pdf | no-capture | 22765431
+ pdf | redirect-loop | 9155767
+ pdf | terminal-bad-status | 3549665
+ pdf | link-loop | 2592983
+ html | wrong-scope | 1088793
+ pdf | wrong-mimetype | 792563
+ pdf | gateway-timeout | 478181
+ html | no-capture | 423917
+ pdf | wayback-content-error | 355828
+ pdf | cdx-error | 343862
+ pdf | null-body | 328774
+ pdf | forbidden | 286647
+ pdf | spn2-cdx-lookup-failure | 276769
+ pdf | spn2-wayback-error | 276080
+ pdf | skip-url-blocklist | 265473
+ html | redirect-loop | 212916
+ pdf | not-found | 204367
+ html | unknown-scope | 204112
+ html | html-resource-no-capture | 166034
+ pdf | blocked-cookie | 160336
+ pdf | too-many-redirects | 152984
+ html | success | 123896
+ pdf | wayback-error | 114388
+ html | null-body | 100296
+ pdf | spn2-error:too-many-redirects | 58336
+ html | wayback-content-error | 53926
+ pdf | invalid-host-resolution | 37226
+ pdf | petabox-error | 37177
+ pdf | remote-server-error | 36439
+ pdf | spn2-error | 27556
+ pdf | spn2-error:proxy-error | 25486
+ pdf | read-timeout | 20745
+ html | wrong-mimetype | 18928
+ html | terminal-bad-status | 14059
+ html | petabox-error | 13533
+ pdf | bad-redirect | 7535
+ xml | success | 6897
+ html | cdx-error | 6823
+ pdf | spn2-error:bad-request | 4664
+ pdf | spn2-error:unauthorized | 4391
+ pdf | spn-remote-error | 4206
+ pdf | spn2-error:service-unavailable | 2614
+ pdf | spn2-error:job-failed | 2562
+ xml | null-body | 2353
+ pdf | other-mimetype | 2304
+ pdf | error | 1905
+ html | spn2-cdx-lookup-failure | 1018
+ pdf | redirects-exceeded | 1015
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 36515867
+ pdf | | 22909334
+ pdf | 301 | 7969702
+ html | 200 | 1653303
+ pdf | 503 | 928507
+ pdf | 403 | 823755
+ pdf | 302 | 792842
+ pdf | 400 | 462108
+ html | | 426474
+ pdf | 404 | 422163
+ pdf | 401 | 270611
+ pdf | 500 | 248675
+ html | 301 | 211713
+ pdf | 303 | 109686
+ pdf | 410 | 50648
+ pdf | 502 | 37663
+ pdf | 429 | 31982
+ pdf | 420 | 26603
+ pdf | 509 | 15113
+ pdf | 409 | 14835
+ html | 404 | 9573
+ pdf | 999 | 9296
+ pdf | 307 | 3972
+ pdf | 308 | 3914
+ html | 500 | 3625
+ pdf | 202 | 3515
+ xml | 200 | 2537
+ pdf | 520 | 2072
+ pdf | 206 | 1665
+ pdf | 521 | 1075
+ html | 302 | 1072
+ pdf | 504 | 1000
+ pdf | 412 | 476
+ pdf | 300 | 434
+ pdf | 505 | 429
+ pdf | 406 | 393
+ html | 403 | 382
+ html | 503 | 378
+ pdf | 421 | 298
+ html | 303 | 268
+ pdf | 508 | 195
+ pdf | 226 | 166
+ pdf | 402 | 70
+ html | 502 | 68
+ pdf | 408 | 50
+ pdf | 204 | 34
+ pdf | 416 | 29
+ pdf | 501 | 29
+ pdf | 530 | 27
+ pdf | 507 | 21
+ (50 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | release_count
+ -------------+---------------
+ 8514315 | 6401104
+ (1 row)
diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt
new file mode 100644
index 0000000..a8a9cd5
--- /dev/null
+++ b/sql/stats/2021-04-08_table_sizes.txt
@@ -0,0 +1,40 @@
+
+## SQL Table Sizes
+
+ Size: 467.23G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 49 GB | 26 GB | 76 GB
+ "public"."grobid" | 69 GB | 6834 MB | 75 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."ingest_request" | 39 GB | 32 GB | 70 GB
+ "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB
+ "public"."file_meta" | 32 GB | 21 GB | 53 GB
+ "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB
+ "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ (12 rows)
+
diff --git a/sql/stats/2021-04-12_ingest_domain_summary_30d.txt b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt
new file mode 100644
index 0000000..6811b54
--- /dev/null
+++ b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt
@@ -0,0 +1,345 @@
+ domain | status | count
+---------------------------------------+-------------------------+--------
+ academic.oup.com | | 4105
+ academic.oup.com | spn2-wayback-error | 1393
+ academic.oup.com | link-loop | 1025
+ academic.oup.com | no-pdf-link | 1020
+ academic.oup.com | spn2-cdx-lookup-failure | 512
+ acervus.unicamp.br | | 1967
+ acervus.unicamp.br | no-pdf-link | 1853
+ acp.copernicus.org | | 620
+ acp.copernicus.org | success | 537
+ aip.scitation.org | | 1310
+ aip.scitation.org | blocked-cookie | 1192
+ alustath.uobaghdad.edu.iq | | 697
+ alustath.uobaghdad.edu.iq | success | 550
+ apex.ipk-gatersleben.de | | 1253
+ apex.ipk-gatersleben.de | no-pdf-link | 1132
+ apps.crossref.org | | 4693
+ apps.crossref.org | no-pdf-link | 4075
+ arxiv.org | | 14990
+ arxiv.org | success | 12899
+ arxiv.org | spn2-wayback-error | 1592
+ ashpublications.org | | 563
+ asmedigitalcollection.asme.org | | 3990
+ asmedigitalcollection.asme.org | spn2-cdx-lookup-failure | 1570
+ asmedigitalcollection.asme.org | no-pdf-link | 1449
+ asmedigitalcollection.asme.org | link-loop | 734
+ assets.researchsquare.com | | 8217
+ assets.researchsquare.com | success | 7116
+ assets.researchsquare.com | spn2-wayback-error | 946
+ av.tib.eu | | 526
+ bioone.org | | 588
+ books.openedition.org | | 1784
+ books.openedition.org | no-pdf-link | 1466
+ boris.unibe.ch | | 1420
+ boris.unibe.ch | success | 743
+ brill.com | | 1773
+ brill.com | link-loop | 879
+ chemrxiv.org | | 857
+ chemrxiv.org | no-pdf-link | 519
+ classiques-garnier.com | | 1072
+ classiques-garnier.com | success | 807
+ content.iospress.com | | 793
+ content.iospress.com | link-loop | 568
+ cyberdoi.ru | | 775
+ cyberdoi.ru | redirect-loop | 775
+ cyberleninka.ru | | 1453
+ cyberleninka.ru | success | 1092
+ d197for5662m48.cloudfront.net | | 632
+ d197for5662m48.cloudfront.net | success | 544
+ dergipark.org.tr | | 3070
+ dergipark.org.tr | success | 1251
+ dergipark.org.tr | no-pdf-link | 843
+ dergipark.org.tr | spn2-wayback-error | 677
+ digi.ub.uni-heidelberg.de | | 502
+ dione.lib.unipi.gr | | 783
+ direct.mit.edu | | 996
+ direct.mit.edu | no-pdf-link | 869
+ dl.acm.org | | 1692
+ dl.acm.org | blocked-cookie | 1558
+ dlc.library.columbia.edu | | 4225
+ dlc.library.columbia.edu | no-pdf-link | 2395
+ dlc.library.columbia.edu | spn2-wayback-error | 1568
+ doi.ala.org.au | | 2570
+ doi.ala.org.au | no-pdf-link | 2153
+ doi.nrct.go.th | | 566
+ doi.org | | 10408
+ doi.org | spn2-cdx-lookup-failure | 9593
+ doi.org | terminal-bad-status | 741
+ downloads.hindawi.com | | 2137
+ downloads.hindawi.com | success | 1787
+ dram.journals.ekb.eg | | 541
+ elib.spbstu.ru | | 1243
+ elib.spbstu.ru | redirect-loop | 1214
+ elibrary.vdi-verlag.de | | 1542
+ elibrary.vdi-verlag.de | spn2-wayback-error | 721
+ elifesciences.org | | 689
+ elifesciences.org | success | 521
+ epos.myesr.org | | 705
+ epos.myesr.org | spn2-wayback-error | 604
+ europepmc.org | | 6996
+ europepmc.org | success | 6031
+ europepmc.org | spn2-wayback-error | 756
+ figshare.com | | 1168
+ figshare.com | no-pdf-link | 726
+ files.osf.io | | 1526
+ files.osf.io | success | 1078
+ fjfsdata01prod.blob.core.windows.net | | 5410
+ fjfsdata01prod.blob.core.windows.net | success | 4581
+ fjfsdata01prod.blob.core.windows.net | spn2-wayback-error | 587
+ fldeploc.dep.state.fl.us | | 774
+ fldeploc.dep.state.fl.us | no-pdf-link | 718
+ geoscan.nrcan.gc.ca | | 2056
+ geoscan.nrcan.gc.ca | no-pdf-link | 2019
+ hcommons.org | | 1593
+ hcommons.org | success | 1333
+ hkvalidate.perfdrive.com | | 1322
+ hkvalidate.perfdrive.com | no-pdf-link | 1083
+ ieeexplore.ieee.org | | 20997
+ ieeexplore.ieee.org | too-many-redirects | 15383
+ ieeexplore.ieee.org | spn2-wayback-error | 2555
+ ieeexplore.ieee.org | success | 2165
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 747
+ jamanetwork.com | | 712
+ journals.aps.org | | 1698
+ journals.aps.org | not-found | 1469
+ journals.library.ualberta.ca | | 733
+ journals.library.ualberta.ca | success | 594
+ journals.lww.com | | 6606
+ journals.lww.com | link-loop | 3102
+ journals.lww.com | spn2-wayback-error | 1645
+ journals.lww.com | terminal-bad-status | 965
+ journals.lww.com | spn2-cdx-lookup-failure | 552
+ journals.openedition.org | | 4594
+ journals.openedition.org | success | 1441
+ journals.openedition.org | redirect-loop | 1316
+ journals.openedition.org | spn2-wayback-error | 1197
+ journals.ub.uni-heidelberg.de | | 1039
+ journals.ub.uni-heidelberg.de | success | 728
+ kiss.kstudy.com | | 747
+ kiss.kstudy.com | no-pdf-link | 686
+ library.iated.org | | 1560
+ library.iated.org | redirect-loop | 1148
+ linkinghub.elsevier.com | | 5079
+ linkinghub.elsevier.com | forbidden | 2226
+ linkinghub.elsevier.com | spn2-wayback-error | 1625
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758
+ mr.crossref.org | | 542
+ nsuworks.nova.edu | | 843
+ nsuworks.nova.edu | success | 746
+ ojs.cvut.cz | | 805
+ ojs.cvut.cz | success | 764
+ ojs.ugent.be | | 867
+ ojs.ugent.be | success | 643
+ onepetro.org | | 603
+ onlinelibrary.wiley.com | | 1203
+ onlinelibrary.wiley.com | blocked-cookie | 758
+ open.library.ubc.ca | | 559
+ osf.io | | 3139
+ osf.io | not-found | 2288
+ osf.io | spn2-wayback-error | 582
+ oxford.universitypressscholarship.com | | 3556
+ oxford.universitypressscholarship.com | link-loop | 2373
+ oxford.universitypressscholarship.com | spn2-wayback-error | 562
+ painphysicianjournal.com | | 804
+ painphysicianjournal.com | success | 668
+ papers.ssrn.com | | 6367
+ papers.ssrn.com | link-loop | 3865
+ papers.ssrn.com | spn2-wayback-error | 1106
+ papers.ssrn.com | spn2-cdx-lookup-failure | 1015
+ peerj.com | | 785
+ peerj.com | no-pdf-link | 552
+ pos.sissa.it | | 1455
+ pos.sissa.it | success | 1153
+ preprints.jmir.org | | 763
+ preprints.jmir.org | no-pdf-link | 611
+ psyarxiv.com | | 641
+ psyarxiv.com | no-pdf-link | 546
+ publikationen.uni-tuebingen.de | | 659
+ publons.com | | 6998
+ publons.com | no-pdf-link | 6982
+ pubs.acs.org | | 5860
+ pubs.acs.org | blocked-cookie | 5185
+ pubs.rsc.org | | 2269
+ pubs.rsc.org | link-loop | 1384
+ res.mdpi.com | | 15776
+ res.mdpi.com | success | 13710
+ res.mdpi.com | spn2-wayback-error | 1424
+ res.mdpi.com | spn2-cdx-lookup-failure | 641
+ rrs.scholasticahq.com | | 1078
+ rrs.scholasticahq.com | success | 803
+ rsdjournal.org | | 755
+ rsdjournal.org | success | 524
+ s3-eu-west-1.amazonaws.com | | 3343
+ s3-eu-west-1.amazonaws.com | success | 2893
+ saemobilus.sae.org | | 795
+ saemobilus.sae.org | no-pdf-link | 669
+ sage.figshare.com | | 725
+ scholar.dkyobobook.co.kr | | 1043
+ scholar.dkyobobook.co.kr | no-pdf-link | 915
+ scholarworks.umass.edu | | 1196
+ scholarworks.umass.edu | success | 713
+ secure.jbs.elsevierhealth.com | | 4202
+ secure.jbs.elsevierhealth.com | blocked-cookie | 4169
+ storage.googleapis.com | | 1720
+ storage.googleapis.com | success | 1466
+ tandf.figshare.com | | 789
+ tandf.figshare.com | no-pdf-link | 640
+ tind-customer-agecon.s3.amazonaws.com | | 584
+ turcomat.org | | 1196
+ turcomat.org | spn2-wayback-error | 997
+ unreserved.rba.gov.au | | 823
+ unreserved.rba.gov.au | no-pdf-link | 821
+ utpjournals.press | | 669
+ utpjournals.press | blocked-cookie | 616
+ watermark.silverchair.com | | 3560
+ watermark.silverchair.com | success | 2788
+ watermark.silverchair.com | spn2-wayback-error | 685
+ wayf.switch.ch | | 1169
+ wayf.switch.ch | no-pdf-link | 809
+ www.ahajournals.org | | 802
+ www.ahajournals.org | blocked-cookie | 597
+ www.ajol.info | | 830
+ www.ajol.info | success | 575
+ www.ams.org | | 868
+ www.ams.org | terminal-bad-status | 666
+ www.atlantis-press.com | | 1579
+ www.atlantis-press.com | success | 1071
+ www.bloomsburycollections.com | | 1745
+ www.bloomsburycollections.com | no-pdf-link | 1571
+ www.brazilianjournals.com | | 1385
+ www.brazilianjournals.com | success | 1107
+ www.cairn.info | | 2479
+ www.cairn.info | no-pdf-link | 818
+ www.cairn.info | link-loop | 790
+ www.cambridge.org | | 6801
+ www.cambridge.org | no-pdf-link | 2990
+ www.cambridge.org | spn2-wayback-error | 1475
+ www.cambridge.org | link-loop | 940
+ www.cambridge.org | success | 863
+ www.cureus.com | | 538
+ www.dbpia.co.kr | | 2958
+ www.dbpia.co.kr | redirect-loop | 2953
+ www.degruyter.com | | 58612
+ www.degruyter.com | no-pdf-link | 41065
+ www.degruyter.com | spn2-wayback-error | 7426
+ www.degruyter.com | success | 6628
+ www.degruyter.com | spn2-cdx-lookup-failure | 1624
+ www.degruyter.com | terminal-bad-status | 1565
+ www.dovepress.com | | 869
+ www.dovepress.com | success | 597
+ www.e-manuscripta.ch | | 1047
+ www.e3s-conferences.org | | 817
+ www.e3s-conferences.org | success | 606
+ www.elgaronline.com | | 535
+ www.elibrary.ru | | 1244
+ www.elibrary.ru | no-pdf-link | 1159
+ www.emc2020.eu | | 791
+ www.emc2020.eu | no-pdf-link | 748
+ www.emerald.com | | 2420
+ www.emerald.com | no-pdf-link | 1986
+ www.eurekaselect.com | | 540
+ www.eurosurveillance.org | | 786
+ www.eurosurveillance.org | success | 710
+ www.finersistemas.com | | 1220
+ www.finersistemas.com | success | 1214
+ www.frontiersin.org | | 915
+ www.frontiersin.org | spn2-wayback-error | 602
+ www.hanspub.org | | 618
+ www.humankineticslibrary.com | | 1122
+ www.humankineticslibrary.com | no-pdf-link | 985
+ www.ijcmas.com | | 513
+ www.inderscience.com | | 1532
+ www.inderscience.com | no-pdf-link | 1217
+ www.indianjournals.com | | 904
+ www.ingentaconnect.com | | 885
+ www.ingentaconnect.com | no-pdf-link | 783
+ www.journals.uchicago.edu | | 6055
+ www.journals.uchicago.edu | blocked-cookie | 5927
+ www.journals.vu.lt | | 791
+ www.journals.vu.lt | success | 545
+ www.jstage.jst.go.jp | | 1490
+ www.jstage.jst.go.jp | remote-server-error | 1023
+ www.jstor.org | | 1103
+ www.jstor.org | redirect-loop | 553
+ www.karger.com | | 733
+ www.liebertpub.com | | 804
+ www.liebertpub.com | blocked-cookie | 714
+ www.liverpooluniversitypress.co.uk | | 620
+ www.liverpooluniversitypress.co.uk | too-many-redirects | 529
+ www.mdpi.com | | 3880
+ www.mdpi.com | spn2-wayback-error | 1651
+ www.mdpi.com | forbidden | 1282
+ www.mdpi.com | spn2-cdx-lookup-failure | 714
+ www.nepjol.info | | 596
+ www.nomos-elibrary.de | | 2235
+ www.nomos-elibrary.de | no-pdf-link | 1128
+ www.nomos-elibrary.de | spn2-wayback-error | 559
+ www.oecd-ilibrary.org | | 3046
+ www.oecd-ilibrary.org | no-pdf-link | 2869
+ www.osapublishing.org | | 821
+ www.osapublishing.org | no-pdf-link | 615
+ www.osti.gov | | 1147
+ www.osti.gov | link-loop | 902
+ www.oxfordscholarlyeditions.com | | 759
+ www.oxfordscholarlyeditions.com | no-pdf-link | 719
+ www.preprints.org | | 783
+ www.preprints.org | success | 595
+ www.repository.cam.ac.uk | | 1146
+ www.research-collection.ethz.ch | | 704
+ www.research-collection.ethz.ch | terminal-bad-status | 684
+ www.researchsquare.com | | 853
+ www.researchsquare.com | spn2-wayback-error | 515
+ www.schweizerbart.de | | 730
+ www.schweizerbart.de | no-pdf-link | 653
+ www.scielo.br | | 1777
+ www.scielo.br | success | 1167
+ www.sciencedirect.com | | 14757
+ www.sciencedirect.com | no-pdf-link | 12733
+ www.sciencedirect.com | spn2-wayback-error | 1503
+ www.sciendo.com | | 1955
+ www.sciendo.com | no-pdf-link | 1176
+ www.scilook.eu | | 812
+ www.scilook.eu | success | 563
+ www.scirp.org | | 749
+ www.tandfonline.com | | 11038
+ www.tandfonline.com | blocked-cookie | 9994
+ www.tandfonline.com | no-pdf-link | 663
+ www.taylorfrancis.com | | 71514
+ www.taylorfrancis.com | spn2-wayback-error | 36663
+ www.taylorfrancis.com | no-pdf-link | 15098
+ www.taylorfrancis.com | forbidden | 8699
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 6894
+ www.taylorfrancis.com | link-loop | 3661
+ www.thieme-connect.de | | 3687
+ www.thieme-connect.de | redirect-loop | 1187
+ www.thieme-connect.de | not-found | 945
+ www.thieme-connect.de | no-pdf-link | 941
+ www.worldscientific.com | | 1476
+ www.worldscientific.com | blocked-cookie | 1323
+ www.zora.uzh.ch | | 1118
+ zenodo.org | | 43010
+ zenodo.org | no-pdf-link | 22015
+ zenodo.org | success | 12747
+ zenodo.org | spn2-wayback-error | 4608
+ zenodo.org | spn2-cdx-lookup-failure | 3215
+ | | 725990
+ | no-pdf-link | 209933
+ | success | 206134
+ | spn2-wayback-error | 127015
+ | spn2-cdx-lookup-failure | 53384
+ | blocked-cookie | 35867
+ | link-loop | 25834
+ | too-many-redirects | 16430
+ | redirect-loop | 14648
+ | forbidden | 13794
+ | terminal-bad-status | 8055
+ | not-found | 6399
+ | remote-server-error | 2402
+ | wrong-mimetype | 2011
+ | spn2-error:unauthorized | 912
+ | bad-redirect | 555
+ | read-timeout | 530
+(341 rows)
+
diff --git a/sql/stats/2021-11-01_table_sizes.txt b/sql/stats/2021-11-01_table_sizes.txt
new file mode 100644
index 0000000..57f7e57
--- /dev/null
+++ b/sql/stats/2021-11-01_table_sizes.txt
@@ -0,0 +1,19 @@
+
+Size: 832.66G
+
+ table_name | table_size | indexes_size | total_size
+-------------------------------+------------+--------------+------------
+ "public"."crossref" | 311 GB | 9812 MB | 320 GB
+ "public"."ingest_request" | 44 GB | 40 GB | 84 GB
+ "public"."cdx" | 52 GB | 28 GB | 80 GB
+ "public"."grobid" | 72 GB | 6952 MB | 79 GB
+ "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 34 GB | 21 GB | 54 GB
+ "public"."pdf_meta" | 20 GB | 5813 MB | 26 GB
+ "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+(13 rows)
diff --git a/sql/stats/2021-11-26_stats.txt b/sql/stats/2021-11-26_stats.txt
new file mode 100644
index 0000000..3a0e561
--- /dev/null
+++ b/sql/stats/2021-11-26_stats.txt
@@ -0,0 +1,424 @@
+
+Date: Sat 27 Nov 2021 03:33:30 AM UTC
+
+## SQL Table Sizes
+
+ Size: 937.28G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 393 GB | 10127 MB | 403 GB
+ "public"."ingest_request" | 44 GB | 41 GB | 84 GB
+ "public"."cdx" | 52 GB | 28 GB | 80 GB
+ "public"."grobid" | 72 GB | 6963 MB | 79 GB
+ "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 34 GB | 21 GB | 55 GB
+ "public"."pdf_meta" | 20 GB | 5869 MB | 26 GB
+ "public"."grobid_refs" | 19 GB | 1690 MB | 21 GB
+ "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 179761501 | 244453538203113
+
+ # 179m files, 244 TB
+
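+An untested sketch: Postgres can also pretty-print that total directly,
+instead of converting bytes to TB by hand:
+
+    -- sketch only, not run against this database
+    SELECT COUNT(*) as total_count, pg_size_pretty(SUM(size_bytes)) as total_size FROM file_meta;
+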
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 179376819
+ application/octet-stream | 155379
+ text/html | 116102
+ application/xml | 42170
+ application/xhtml+xml | 24347
+ text/plain | 15990
+ application/jats+xml | 6899
+ application/gzip | 6491
+ | 6034
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ application/x-bzip2 | 891
+ image/jpeg | 794
+ image/gif | 389
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 303
+ application/x-compress | 272
+ application/zip | 131
+ image/png | 121
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 57
+ application/x-dosexec | 51
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ text/rtf | 33
+ application/x-dvi | 29
+ application/x-rar | 29
+ video/mp4 | 29
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 62196
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 119049962 | 149169240
+
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 137271670
+ warc/revisit | 9709493
+ application/octet-stream | 590443
+ text/xml | 525481
+ text/html | 421030
+ unk | 207442
+ application/postscript | 81123
+ application/save | 80988
+ binary/octet-stream | 67476
+ image/pdf | 39419
+ application/x-download | 38278
+ text/plain | 36159
+ application/download | 34328
+ application/force-download | 19729
+ multipart/form-data | 9105
+ application | 5299
+ application/x-msdownload | 3851
+ application/x-octetstream | 3649
+ .pdf | 3318
+ application/x-pdf | 2992
+ pdf | 1484
+ file | 1364
+ application/binary | 1354
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1029
+ application/unknown | 1000
+ 0 | 764
+ text/pdf | 704
+ application/blob | 673
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 111236904
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 102962304
+ 500 | 8269129
+ -4 | 5013
+ 503 | 548
+
+TODO: how many failed, by mimetype? To check whether we are (or have been)
+running non-PDF files through by mistake.
+
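+A sketch for that TODO (untested; assumes GROBID results and file metadata
+can be joined on `sha1hex`):
+
+    -- mimetypes of files whose GROBID run did not return 200
+    SELECT file_meta.mimetype, COUNT(*)
+    FROM grobid
+    LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+    WHERE grobid.status_code != 200
+    GROUP BY file_meta.mimetype
+    ORDER BY COUNT DESC
+    LIMIT 25;
+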
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.5.5-fatcat | 89983404
+ | 12892161
+ 0.7.0-104-gbeebd9a6b | 86739
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 43701948
+ pdf | unpaywall | 37802895
+ pdf | doi | 28736398
+ pdf | doaj | 4264610
+ html | doaj | 2429003
+ pdf | pmc | 2383398
+ pdf | arxiv | 2330054
+ html | doi | 39725
+ xml | doaj | 9442
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 689
+ html | spn | 48
+ xml | spn | 1
+ (15 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 43701948
+ pdf | unpaywall | unpaywall | 37802895
+ pdf | doi | fatcat-changelog | 16207728
+ pdf | doi | fatcat-ingest | 9012282
+ pdf | doaj | doaj | 4264610
+ pdf | doi | fatcat-ingest-container | 3515873
+ html | doaj | doaj | 2429003
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1767705
+ pdf | arxiv | fatcat-changelog | 562320
+ pdf | pmc | fatcat-ingest | 297527
+ pdf | pmc | fatcat-changelog | 57046
+ html | doi | fatcat-ingest | 37788
+ xml | doaj | doaj | 9442
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ html | doi | fatcat-changelog | 1897
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 689
+ pdf | doi | savepapernow-web | 613
+ html | spn | savepapernow-web | 48
+ html | doi | savepapernow-web | 40
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 3
+ xml | spn | savepapernow-web | 1
+ (25 rows)
+
+Uncrawled requests by source:
+
+    # TODO: verify this? (a possible cross-check is sketched below)
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 169076
+ pdf | oai | 15283
+ pdf | doaj | 2063
+ html | doaj | 620
+ pdf | doi | 22
+ pdf | unpaywall | 17
+
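+A possible cross-check for the anti-join above (untested sketch; should
+return the same counts):
+
+    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+    FROM ingest_request
+    WHERE NOT EXISTS (
+        SELECT 1 FROM ingest_file_result
+        WHERE ingest_file_result.base_url = ingest_request.base_url
+        AND ingest_file_result.ingest_type = ingest_request.ingest_type
+    )
+    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;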
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 14554221 | 0.284
+ pdf | mag | 43701948 | 32643175 | 0.747
+ pdf | unpaywall | 37802895 | 29989257 | 0.793
+ pdf | doi | 28736547 | 7690393 | 0.268
+ pdf | doaj | 4264610 | 2851601 | 0.669
+ html | doaj | 2429003 | 122937 | 0.051
+ pdf | pmc | 2383398 | 1821071 | 0.764
+ pdf | arxiv | 2330054 | 2159738 | 0.927
+ html | doi | 39725 | 1235 | 0.031
+ xml | doaj | 9442 | 6897 | 0.730
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 689 | 503 | 0.730
+ html | spn | 48 | 5 | 0.104
+ xml | spn | 1 | 0 | 0.000
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+
+ ingest_type | status | count
+ -------------+-------------------------------+----------
+ pdf | success | 78944243
+ pdf | no-pdf-link | 26270027
+ pdf | no-capture | 23267156
+ pdf | redirect-loop | 9837466
+ pdf | terminal-bad-status | 4147454
+ pdf | skip-url-blocklist | 3088907
+ pdf | link-loop | 2953891
+ pdf | blocked-cookie | 1855541
+ html | wrong-scope | 1106171
+ pdf | wrong-mimetype | 859941
+ pdf | gateway-timeout | 729771
+ pdf | spn2-cdx-lookup-failure | 584856
+ html | no-capture | 423917
+ pdf | forbidden | 390804
+ pdf | cdx-error | 363091
+ pdf | wayback-content-error | 354894
+ pdf | null-body | 341698
+ pdf | too-many-redirects | 307096
+ pdf | not-found | 294592
+ html | redirect-loop | 213032
+ html | unknown-scope | 207923
+ pdf | spn2-error | 192046
+ html | html-resource-no-capture | 166119
+ html | success | 124177
+ pdf | wayback-error | 105385
+ html | null-body | 100296
+ pdf | spn2-wayback-error | 73176
+ pdf | remote-server-error | 60908
+ pdf | spn2-error:too-many-redirects | 58076
+ pdf | skip-wall | 57744
+ html | wayback-content-error | 53928
+ pdf | read-timeout | 42465
+ pdf | invalid-host-resolution | 37221
+ pdf | petabox-error | 28765
+ pdf | spn2-error:unknown | 23885
+ html | wrong-mimetype | 18930
+ pdf | bad-redirect | 14708
+ html | terminal-bad-status | 14070
+ html | petabox-error | 13770
+ html | spn2-cdx-lookup-failure | 13002
+ pdf | spn2-error:job-failed | 9721
+ html | cdx-error | 7167
+ xml | success | 6897
+ pdf | spn2-error:bad-request | 4433
+ pdf | spn-remote-error | 4206
+ pdf | body-too-large | 3019
+ xml | null-body | 2353
+ pdf | other-mimetype | 2304
+ pdf | error | 1900
+ pdf | spn2-error:proxy-error | 1850
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 36821458
+ pdf | | 26058729
+ pdf | 301 | 8466302
+ html | 200 | 1676730
+ pdf | 503 | 1028504
+ pdf | 302 | 949465
+ pdf | 403 | 936737
+ pdf | 404 | 687661
+ pdf | 400 | 507303
+ html | | 439356
+ pdf | 401 | 288994
+ pdf | 500 | 263775
+ html | 301 | 211796
+ pdf | 303 | 130719
+ pdf | 410 | 66495
+ pdf | 502 | 41760
+ pdf | 429 | 35266
+ pdf | 420 | 26722
+ pdf | 409 | 15204
+ pdf | 509 | 15113
+ pdf | 999 | 11409
+ html | 404 | 9578
+ pdf | 307 | 8404
+ pdf | 308 | 5514
+ pdf | 202 | 4724
+ html | 500 | 3628
+ xml | 200 | 2537
+ pdf | 520 | 2199
+ pdf | 206 | 1694
+ html | 302 | 1138
+ pdf | 504 | 1124
+ pdf | 521 | 1085
+ pdf | 412 | 921
+ pdf | 421 | 714
+ pdf | 300 | 461
+ pdf | 505 | 436
+ pdf | 406 | 427
+ pdf | 508 | 408
+ html | 403 | 382
+ html | 503 | 378
+ html | 303 | 268
+ pdf | 204 | 252
+ pdf | 226 | 166
+ pdf | 402 | 70
+ html | 502 | 68
+ pdf | 523 | 55
+ pdf | 408 | 53
+ pdf | 432 | 45
+ pdf | 530 | 31
+ pdf | 416 | 31
+ (50 rows)
diff --git a/sql/stats/2021-12-02_table_sizes.txt b/sql/stats/2021-12-02_table_sizes.txt
new file mode 100644
index 0000000..b03c370
--- /dev/null
+++ b/sql/stats/2021-12-02_table_sizes.txt
@@ -0,0 +1,22 @@
+
+Size: 940.66G
+
+ table_name | table_size | indexes_size | total_size
+------------------------------------+------------+--------------+------------
+ "public"."crossref" | 394 GB | 10138 MB | 404 GB
+ "public"."ingest_request" | 44 GB | 41 GB | 85 GB
+ "public"."cdx" | 52 GB | 28 GB | 80 GB
+ "public"."grobid" | 72 GB | 6978 MB | 79 GB
+ "public"."ingest_file_result" | 38 GB | 41 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 34 GB | 21 GB | 55 GB
+ "public"."pdf_meta" | 20 GB | 5930 MB | 26 GB
+ "public"."grobid_refs" | 19 GB | 1752 MB | 21 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+(16 rows)
diff --git a/sql/stats/2022-04-26_stats.txt b/sql/stats/2022-04-26_stats.txt
new file mode 100644
index 0000000..bd20c5c
--- /dev/null
+++ b/sql/stats/2022-04-26_stats.txt
@@ -0,0 +1,432 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 416 GB | 10 GB | 426 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 58 GB | 41 GB | 99 GB
+ "public"."ingest_request" | 50 GB | 48 GB | 98 GB
+ "public"."ingest_file_result" | 42 GB | 48 GB | 90 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 37 GB | 34 GB | 71 GB
+ "public"."pdf_meta" | 21 GB | 7386 MB | 29 GB
+ "public"."grobid_refs" | 23 GB | 2516 MB | 26 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 3015 MB | 31 MB | 3046 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 192402128 | 271919997557597
+ (1 row)
+
+ # 271,919,997,557,597 -> ~272 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 191760695
+ text/html | 330351
+ application/octet-stream | 186696
+ application/xml | 42170
+ application/xhtml+xml | 31470
+ text/plain | 16449
+ application/jats+xml | 6902
+ application/gzip | 6681
+ | 6033
+ application/postscript | 4916
+ image/jpeg | 2901
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 934
+ application/x-bzip2 | 891
+ image/png | 476
+ application/x-dosexec | 404
+ image/gif | 395
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 374
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 294
+ application/x-compress | 274
+ video/mp4 | 150
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/zlib | 68
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ image/g3fax | 35
+ text/rtf | 33
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 12831
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 130732381 | 162760251
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 149749828
+ warc/revisit | 10437210
+ application/octet-stream | 733161
+ text/html | 642992
+ text/xml | 525483
+ unk | 217642
+ application/postscript | 81127
+ application/save | 81023
+ binary/octet-stream | 67938
+ application/x-download | 41137
+ image/pdf | 39712
+ application/download | 37153
+ text/plain | 36342
+ application/force-download | 21496
+ multipart/form-data | 9792
+ application | 5366
+ application/x-octetstream | 5166
+ application/x-msdownload | 3851
+ .pdf | 3445
+ application/x-pdf | 3018
+ pdf | 1618
+ file | 1370
+ application/binary | 1354
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1047
+ application/unknown | 1001
+ 0 | 773
+ text/pdf | 729
+ application/blob | 673
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 123669603
+ (1 row)
+
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 115668412
+ 500 | 7995428
+ -4 | 5745
+ 503 | 18
+ (4 rows)
+
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.7.0-131-gdd0251d9f | 54780825
+ 0.5.5-fatcat | 48003940
+ | 12694404
+ 0.7.0-104-gbeebd9a6b | 189243
+ (4 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | unpaywall | 43932525
+ pdf | mag | 43701948
+ pdf | doi | 40044585
+ pdf | doaj | 6016771
+ html | doaj | 3648181
+ pdf | arxiv | 2676200
+ pdf | pmc | 2402453
+ html | doi | 41492
+ xml | doaj | 20638
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 829
+ html | spn | 52
+ xml | doi | 1
+ xml | spn | 1
+ (16 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | unpaywall | unpaywall | 43932525
+ pdf | mag | mag-corpus | 43701948
+ pdf | doi | fatcat-changelog | 20936949
+ pdf | doi | fatcat-ingest | 15590201
+ pdf | doaj | doaj | 6016771
+ html | doaj | doaj | 3648181
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1984766
+ pdf | arxiv | fatcat-changelog | 691405
+ pdf | pmc | fatcat-ingest | 297646
+ pdf | pmc | fatcat-changelog | 75982
+ html | doi | fatcat-ingest | 37904
+ xml | doaj | doaj | 20638
+ html | doi | fatcat-changelog | 3534
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | doi | savepapernow-web | 1562
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 829
+ html | doi | savepapernow-web | 54
+ html | spn | savepapernow-web | 52
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 3
+ xml | doi | savepapernow-web | 1
+ xml | spn | savepapernow-web | 1
+ (26 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doaj | 1619621
+ html | doaj | 1208412
+ pdf | mag | 167653
+ pdf | oai | 15282
+ xml | doaj | 11196
+ pdf | unpaywall | 270
+ pdf | doi | 22
+ (7 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 15968290 | 0.312
+ pdf | unpaywall | 43932525 | 32618045 | 0.742
+ pdf | mag | 43701948 | 32662926 | 0.747
+ pdf | doi | 40044738 | 10925369 | 0.273
+ pdf | doaj | 6016771 | 3042569 | 0.506
+ html | doaj | 3648181 | 344208 | 0.094
+ pdf | arxiv | 2676206 | 2269708 | 0.848
+ pdf | pmc | 2402453 | 1855679 | 0.772
+ html | doi | 41492 | 1739 | 0.042
+ xml | doaj | 20638 | 6899 | 0.334
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 829 | 616 | 0.743
+ html | spn | 52 | 7 | 0.135
+ xml | doi | 1 | 0 | 0.000
+ xml | spn | 1 | 0 | 0.000
+ (16 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+---------------------------------+----------
+ pdf | success | 85709322
+ pdf | no-pdf-link | 29713304
+ pdf | no-capture | 26632191
+ pdf | redirect-loop | 10979145
+ pdf | terminal-bad-status | 4977000
+ pdf | link-loop | 3434877
+ pdf | skip-url-blocklist | 3114258
+ pdf | blocked-cookie | 2156835
+ html | wrong-scope | 1126911
+ pdf | wrong-mimetype | 980546
+ pdf | gateway-timeout | 651562
+ pdf | spn2-cdx-lookup-failure | 484016
+ pdf | spn2-backoff | 399382
+ pdf | cdx-error | 373964
+ pdf | wayback-content-error | 354370
+ html | success | 345860
+ pdf | null-body | 336182
+ pdf | spn2-error:500 | 309755
+ pdf | forbidden | 291175
+ pdf | not-found | 275560
+ pdf | too-many-redirects | 262312
+ html | unknown-scope | 230352
+ html | redirect-loop | 226596
+ html | html-resource-no-capture | 205646
+ html | no-capture | 164014
+ component | spn2-cdx-lookup-failure | 148825
+ component | wrong-mimetype | 130344
+ html | null-body | 100296
+ pdf | wayback-error | 94286
+ pdf | spn2-wayback-error | 81365
+ component | no-capture | 75278
+ pdf | spn2-error | 69830
+ pdf | skip-wall | 57744
+ pdf | spn2-error:too-many-redirects | 53808
+ pdf | remote-server-error | 41286
+ pdf | petabox-error | 38800
+ pdf | invalid-host-resolution | 37337
+ pdf | read-timeout | 36872
+ component | spn2-backoff | 33217
+ pdf | empty-blob | 27946
+ component | spn2-error | 24078
+ pdf | spn2-error:unknown | 23697
+ component | gateway-timeout | 23139
+ html | wrong-mimetype | 22731
+ html | wayback-content-error | 20507
+ pdf | spn2-error:host-crawling-paused | 19900
+ pdf | bad-redirect | 19183
+ html | terminal-bad-status | 13354
+ component | blocked-cookie | 12287
+ component | spn2-error:500 | 11271
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 38144779
+ pdf | | 32762240
+ pdf | 301 | 9433087
+ html | 200 | 1716127
+ pdf | 403 | 1416632
+ pdf | 302 | 1134668
+ pdf | 404 | 888853
+ pdf | 401 | 746311
+ pdf | 503 | 655894
+ pdf | 400 | 531479
+ component | | 337603
+ pdf | 500 | 247944
+ html | 301 | 224237
+ html | | 167194
+ pdf | 303 | 135048
+ component | 200 | 130663
+ pdf | 429 | 93489
+ pdf | 410 | 67392
+ pdf | 420 | 26722
+ pdf | 502 | 18770
+ pdf | 409 | 15152
+ pdf | 509 | 15113
+ pdf | 999 | 11747
+ html | 404 | 9879
+ pdf | 307 | 8895
+ pdf | 412 | 7053
+ pdf | 308 | 6627
+ pdf | 202 | 5289
+ xml | 200 | 2540
+ html | 500 | 2480
+ pdf | 520 | 2220
+ pdf | 521 | 1844
+ pdf | 206 | 1739
+ html | 302 | 1407
+ pdf | 504 | 1146
+ html | 303 | 1123
+ pdf | 421 | 986
+ pdf | 406 | 938
+ pdf | 204 | 498
+ pdf | 505 | 468
+ pdf | 300 | 436
+ pdf | 508 | 422
+ pdf | 426 | 405
+ html | 429 | 402
+ html | 403 | 398
+ pdf | 432 | 366
+ component | 301 | 294
+ pdf | 405 | 210
+ pdf | 226 | 166
+ component | 302 | 128
+ (50 rows)
+
diff --git a/sql/stats/2022-04-27_crawl_changelog.txt b/sql/stats/2022-04-27_crawl_changelog.txt
new file mode 100644
index 0000000..864abd4
--- /dev/null
+++ b/sql/stats/2022-04-27_crawl_changelog.txt
@@ -0,0 +1,191 @@
+ domain | status | count
+--------------------------------------+-------------------------+--------
+ academic.oup.com | | 1243
+ academic.oup.com | spn2-cdx-lookup-failure | 990
+ aip.scitation.org | | 313
+ aip.scitation.org | spn2-cdx-lookup-failure | 224
+ ajps.uomustansiriyah.edu.iq | | 235
+ apps.crossref.org | | 1329
+ apps.crossref.org | spn2-cdx-lookup-failure | 942
+ apps.crossref.org | no-pdf-link | 387
+ archaeologydataservice.ac.uk | | 422
+ archaeologydataservice.ac.uk | spn2-cdx-lookup-failure | 289
+ arxiv.org | | 3512
+ arxiv.org | spn2-cdx-lookup-failure | 2319
+ arxiv.org | success | 1177
+ assets.researchsquare.com | | 571
+ assets.researchsquare.com | spn2-cdx-lookup-failure | 322
+ assets.researchsquare.com | success | 249
+ brill.com | | 397
+ brill.com | spn2-cdx-lookup-failure | 265
+ cla.berkeley.edu | | 239
+ classiques-garnier.com | | 249
+ cyberleninka.ru | | 340
+ cyberleninka.ru | spn2-cdx-lookup-failure | 244
+ dergipark.org.tr | | 468
+ dergipark.org.tr | spn2-cdx-lookup-failure | 333
+ dl.acm.org | | 592
+ dl.acm.org | spn2-cdx-lookup-failure | 470
+ doi.ala.org.au | | 288
+ doi.ala.org.au | spn2-cdx-lookup-failure | 220
+ doi.org | | 1107
+ doi.org | terminal-bad-status | 679
+ doi.org | spn2-cdx-lookup-failure | 415
+ downloads.hindawi.com | | 279
+ downloads.hindawi.com | success | 267
+ edbs.uomustansiriyah.edu.iq | | 294
+ edbs.uomustansiriyah.edu.iq | spn2-cdx-lookup-failure | 209
+ elibrary.kdpu.edu.ua | | 320
+ elibrary.kdpu.edu.ua | spn2-cdx-lookup-failure | 233
+ elibrary.ru | | 722
+ elibrary.ru | spn2-cdx-lookup-failure | 505
+ europepmc.org | | 986
+ europepmc.org | spn2-cdx-lookup-failure | 681
+ europepmc.org | success | 291
+ figshare.com | | 377
+ figshare.com | spn2-cdx-lookup-failure | 328
+ fjfsdata01prod.blob.core.windows.net | | 255
+ fjfsdata01prod.blob.core.windows.net | spn2-cdx-lookup-failure | 216
+ hammer.purdue.edu | | 224
+ ieeexplore.ieee.org | | 3904
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 2654
+ ieeexplore.ieee.org | gateway-timeout | 792
+ ieeexplore.ieee.org | spn2-backoff | 419
+ journals.eco-vector.com | | 428
+ journals.eco-vector.com | spn2-cdx-lookup-failure | 306
+ journals.lww.com | | 727
+ journals.lww.com | spn2-cdx-lookup-failure | 622
+ journals.openedition.org | | 806
+ journals.openedition.org | spn2-cdx-lookup-failure | 554
+ journals.plos.org | | 348
+ journals.plos.org | spn2-cdx-lookup-failure | 244
+ kiss.kstudy.com | | 226
+ kluwerlawonline.com | | 723
+ kluwerlawonline.com | spn2-cdx-lookup-failure | 489
+ kluwerlawonline.com | link-loop | 203
+ linkinghub.elsevier.com | | 401
+ linkinghub.elsevier.com | spn2-backoff | 342
+ mdpi-res.com | | 1463
+ mdpi-res.com | success | 1337
+ muse.jhu.edu | | 346
+ muse.jhu.edu | spn2-cdx-lookup-failure | 253
+ onepetro.org | | 363
+ onepetro.org | spn2-cdx-lookup-failure | 284
+ online.ucpress.edu | | 1620
+ online.ucpress.edu | spn2-cdx-lookup-failure | 1511
+ onlinelibrary.wiley.com | | 2913
+ onlinelibrary.wiley.com | spn2-cdx-lookup-failure | 2109
+ onlinelibrary.wiley.com | terminal-bad-status | 787
+ opendata.uni-halle.de | | 519
+ opendata.uni-halle.de | spn2-cdx-lookup-failure | 343
+ osf.io | | 1554
+ osf.io | spn2-cdx-lookup-failure | 1350
+ papers.ssrn.com | | 2207
+ papers.ssrn.com | spn2-cdx-lookup-failure | 1727
+ papers.ssrn.com | link-loop | 457
+ psycharchives.org | | 384
+ psycharchives.org | spn2-cdx-lookup-failure | 283
+ publons.com | | 493
+ publons.com | spn2-cdx-lookup-failure | 348
+ pubs.acs.org | | 1240
+ pubs.acs.org | spn2-cdx-lookup-failure | 881
+ pubs.acs.org | terminal-bad-status | 298
+ pubs.rsc.org | | 603
+ pubs.rsc.org | spn2-cdx-lookup-failure | 460
+ repositories.lib.utexas.edu | | 1861
+ repositories.lib.utexas.edu | spn2-cdx-lookup-failure | 1288
+ repositories.lib.utexas.edu | terminal-bad-status | 523
+ s3-eu-west-1.amazonaws.com | | 216
+ sage.figshare.com | | 374
+ sage.figshare.com | spn2-cdx-lookup-failure | 309
+ scholar.dkyobobook.co.kr | | 220
+ scholarworks.gsu.edu | | 749
+ scholarworks.gsu.edu | spn2-cdx-lookup-failure | 577
+ tandf.figshare.com | | 214
+ www.atlantis-press.com | | 338
+ www.atlantis-press.com | spn2-cdx-lookup-failure | 214
+ www.cairn.info | | 782
+ www.cairn.info | spn2-cdx-lookup-failure | 541
+ www.cambridge.org | | 2325
+ www.cambridge.org | spn2-cdx-lookup-failure | 1787
+ www.cambridge.org | no-pdf-link | 300
+ www.cell.com | | 213
+ www.concrete.org | | 476
+ www.concrete.org | spn2-cdx-lookup-failure | 340
+ www.dbpia.co.kr | | 375
+ www.dbpia.co.kr | spn2-cdx-lookup-failure | 275
+ www.degruyter.com | | 3849
+ www.degruyter.com | spn2-cdx-lookup-failure | 2969
+ www.degruyter.com | no-pdf-link | 712
+ www.dib.ie | | 1100
+ www.dib.ie | spn2-cdx-lookup-failure | 1038
+ www.e-periodica.ch | | 821
+ www.e-periodica.ch | spn2-cdx-lookup-failure | 620
+ www.e-periodica.ch | no-pdf-link | 201
+ www.elibrary.ru | | 401
+ www.elibrary.ru | spn2-cdx-lookup-failure | 281
+ www.emerald.com | | 390
+ www.emerald.com | spn2-cdx-lookup-failure | 275
+ www.eurekaselect.com | | 275
+ www.frontiersin.org | | 1266
+ www.frontiersin.org | spn2-cdx-lookup-failure | 1025
+ www.hanspub.org | | 229
+ www.hindawi.com | | 604
+ www.hindawi.com | spn2-cdx-lookup-failure | 594
+ www.inderscience.com | | 201
+ www.jstage.jst.go.jp | | 1094
+ www.jstage.jst.go.jp | spn2-cdx-lookup-failure | 807
+ www.jstage.jst.go.jp | success | 206
+ www.mdpi.com | | 4340
+ www.mdpi.com | spn2-cdx-lookup-failure | 4258
+ www.nomos-elibrary.de | | 2749
+ www.nomos-elibrary.de | spn2-cdx-lookup-failure | 1909
+ www.nomos-elibrary.de | redirect-loop | 819
+ www.osti.gov | | 275
+ www.oxfordhandbooks.com | | 248
+ www.oxfordhandbooks.com | spn2-cdx-lookup-failure | 224
+ www.pdcnet.org | | 217
+ www.researchsquare.com | | 483
+ www.researchsquare.com | spn2-cdx-lookup-failure | 317
+ www.scielo.br | | 319
+ www.scielo.br | spn2-cdx-lookup-failure | 222
+ www.sciencedirect.com | | 3384
+ www.sciencedirect.com | spn2-cdx-lookup-failure | 3267
+ www.spiedigitallibrary.org | | 441
+ www.spiedigitallibrary.org | spn2-cdx-lookup-failure | 327
+ www.tandfonline.com | | 2401
+ www.tandfonline.com | spn2-cdx-lookup-failure | 1552
+ www.tandfonline.com | no-pdf-link | 303
+ www.tandfonline.com | blocked-cookie | 250
+ www.taylorfrancis.com | | 1232
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 908
+ www.thieme-connect.de | | 520
+ www.thieme-connect.de | spn2-cdx-lookup-failure | 366
+ www.worldscientific.com | | 383
+ www.worldscientific.com | spn2-cdx-lookup-failure | 276
+ zenodo.org | | 10625
+ zenodo.org | spn2-cdx-lookup-failure | 7777
+ zenodo.org | success | 1574
+ zenodo.org | no-pdf-link | 1160
+ zivahub.uct.ac.za | | 3428
+ zivahub.uct.ac.za | spn2-cdx-lookup-failure | 2845
+ zivahub.uct.ac.za | no-pdf-link | 583
+ | | 130491
+ | spn2-cdx-lookup-failure | 95169
+ | success | 13354
+ | no-pdf-link | 9621
+ | terminal-bad-status | 3385
+ | spn2-backoff | 2396
+ | redirect-loop | 2216
+ | link-loop | 1850
+ | gateway-timeout | 1061
+ | spn2-error:blocked-url | 428
+ | blocked-cookie | 415
+ | spn2-error | 246
+(182 rows)
+
+----
+
+The overwhelming majority of failures are `spn2-cdx-lookup-failure`. Should
+check in after a week or two, once crawling and retries are running smoothly,
+and see what things look like then.
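+
+A sketch of the kind of query to re-run for that check (untested; assumes
+`ingest_request.created` and `ingest_file_result.terminal_url` columns, and
+the `fatcat-changelog` request source):
+
+    SELECT domain, status, COUNT(*)
+    FROM (
+        SELECT
+            ingest_file_result.status,
+            -- assumes a terminal_url column; extract just the hostname
+            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+        FROM ingest_file_result
+        JOIN ingest_request
+            ON ingest_request.ingest_type = ingest_file_result.ingest_type
+            AND ingest_request.base_url = ingest_file_result.base_url
+        -- assumes a created timestamp on ingest_request
+        WHERE ingest_request.created >= NOW() - '14 days'::INTERVAL
+            AND ingest_request.ingest_request_source = 'fatcat-changelog'
+    ) t1
+    WHERE t1.domain != ''
+    GROUP BY CUBE (domain, status)
+    HAVING COUNT(*) > 200
+    ORDER BY domain, COUNT(*) DESC;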
diff --git a/sql/stats/2022-05-11_crawl_changelog.txt b/sql/stats/2022-05-11_crawl_changelog.txt
new file mode 100644
index 0000000..8d98217
--- /dev/null
+++ b/sql/stats/2022-05-11_crawl_changelog.txt
@@ -0,0 +1,410 @@
+ domain | status | count
+-----------------------------------------------------------------+-------------------------+--------
+ academic.oup.com | | 2210
+ academic.oup.com | no-pdf-link | 1350
+ academic.oup.com | bad-redirect | 510
+ academiccommons.columbia.edu | | 379
+ academiccommons.columbia.edu | success | 339
+ aip.scitation.org | | 762
+ aip.scitation.org | terminal-bad-status | 430
+ apps.crossref.org | | 9894
+ apps.crossref.org | no-pdf-link | 9886
+ apps.euskadi.eus | | 242
+ apps.euskadi.eus | no-pdf-link | 240
+ arxiv.org | | 44889
+ arxiv.org | success | 28781
+ arxiv.org | spn2-backoff | 7975
+ arxiv.org | terminal-bad-status | 4508
+ arxiv.org | spn2-cdx-lookup-failure | 2010
+ arxiv.org | redirect-loop | 619
+ arxiv.org | no-pdf-link | 242
+ arxiv.org | spn2-error | 236
+ asa.scitation.org | | 356
+ asa.scitation.org | terminal-bad-status | 299
+ asmedigitalcollection.asme.org | | 240
+ assets.cureus.com | | 336
+ assets.cureus.com | success | 335
+ assets.researchsquare.com | | 1042
+ assets.researchsquare.com | success | 993
+ av.tib.eu | | 205
+ av.tib.eu | no-pdf-link | 203
+ bibliographie.uni-tuebingen.de | | 213
+ bibliographie.uni-tuebingen.de | no-pdf-link | 211
+ biorxiv.org | redirect-loop | 217
+ biorxiv.org | | 217
+ books.openedition.org | | 691
+ books.openedition.org | no-pdf-link | 687
+ boris.unibe.ch | | 525
+ boris.unibe.ch | success | 466
+ bridges.monash.edu | | 663
+ bridges.monash.edu | no-pdf-link | 647
+ brill.com | | 860
+ brill.com | success | 434
+ chemrxiv.org | | 201
+ classiques-garnier.com | | 242
+ content.iospress.com | | 325
+ content.iospress.com | link-loop | 247
+ core.tdar.org | | 216
+ core.tdar.org | no-pdf-link | 211
+ cyberleninka.ru | | 646
+ cyberleninka.ru | success | 620
+ d197for5662m48.cloudfront.net | | 263
+ d197for5662m48.cloudfront.net | success | 262
+ dergipark.org.tr | | 891
+ dergipark.org.tr | success | 526
+ dergipark.org.tr | no-pdf-link | 261
+ digi.ub.uni-heidelberg.de | | 427
+ digi.ub.uni-heidelberg.de | no-pdf-link | 427
+ direct.mit.edu | | 268
+ direct.mit.edu | no-pdf-link | 208
+ dl.acm.org | | 1719
+ dl.acm.org | success | 829
+ dl.acm.org | no-pdf-link | 546
+ dl.acm.org | terminal-bad-status | 205
+ dlc.library.columbia.edu | | 385
+ dlc.library.columbia.edu | terminal-bad-status | 319
+ doi.ala.org.au | | 724
+ doi.ala.org.au | no-pdf-link | 721
+ doi.apa.org | | 214
+ doi.org | | 3390
+ doi.org | terminal-bad-status | 2938
+ doi.org | redirect-loop | 233
+ doi.org | spn2-wayback-error | 208
+ doi.usp.org | | 325
+ doi.usp.org | no-pdf-link | 324
+ downloads.hindawi.com | | 1439
+ downloads.hindawi.com | success | 1436
+ du.diva-portal.org | | 589
+ du.diva-portal.org | success | 586
+ econtents.bc.unicamp.br | | 310
+ econtents.bc.unicamp.br | success | 310
+ ediss.uni-goettingen.de | | 728
+ ediss.uni-goettingen.de | success | 425
+ elibrary.kdpu.edu.ua | | 907
+ elibrary.kdpu.edu.ua | bad-redirect | 712
+ elibrary.ru | | 925
+ elibrary.ru | terminal-bad-status | 492
+ elibrary.ru | bad-redirect | 230
+ elibrary.vdi-verlag.de | | 393
+ elifesciences.org | | 296
+ elifesciences.org | success | 276
+ europepmc.org | | 3024
+ europepmc.org | success | 2541
+ europepmc.org | terminal-bad-status | 463
+ figshare.com | | 493
+ figshare.com | no-pdf-link | 440
+ files.osf.io | | 883
+ files.osf.io | success | 686
+ fjfsdata01prod.blob.core.windows.net | | 3869
+ fjfsdata01prod.blob.core.windows.net | success | 3818
+ ieeexplore.ieee.org | | 10854
+ ieeexplore.ieee.org | gateway-timeout | 5495
+ ieeexplore.ieee.org | spn2-backoff | 1662
+ ieeexplore.ieee.org | no-pdf-link | 1417
+ ieeexplore.ieee.org | success | 1410
+ ieeexplore.ieee.org | redirect-loop | 768
+ iiif.crossasia.org | | 7608
+ iiif.crossasia.org | no-pdf-link | 7568
+ ikee.lib.auth.gr | | 450
+ ikee.lib.auth.gr | success | 332
+ ins.journals.ekb.eg | | 212
+ iopscience.iop.org | | 268
+ jamanetwork.com | | 333
+ journals.aps.org | | 414
+ journals.asm.org | | 242
+ journals.flvc.org | | 245
+ journals.flvc.org | success | 242
+ journals.healio.com | | 755
+ journals.healio.com | terminal-bad-status | 668
+ journals.lincoln.ac.nz | | 244
+ journals.lincoln.ac.nz | success | 239
+ journals.lww.com | | 1772
+ journals.lww.com | link-loop | 1425
+ journals.lww.com | spn2-backoff | 209
+ journals.openedition.org | | 1192
+ journals.openedition.org | redirect-loop | 467
+ journals.openedition.org | success | 451
+ journals.plos.org | | 771
+ journals.plos.org | success | 750
+ journals.ub.uni-heidelberg.de | | 787
+ journals.ub.uni-heidelberg.de | success | 741
+ kazanmedjournal.ru | | 240
+ kazanmedjournal.ru | success | 231
+ kiss.kstudy.com | | 219
+ kiss.kstudy.com | no-pdf-link | 218
+ kluwerlawonline.com | | 444
+ kluwerlawonline.com | link-loop | 402
+ libraetd.lib.virginia.edu | | 362
+ libraetd.lib.virginia.edu | no-pdf-link | 361
+ link.springer.com | | 305
+ linkinghub.elsevier.com | | 568
+ linkinghub.elsevier.com | spn2-backoff | 545
+ ltu-figshare-repo.s3.aarnet.edu.au | | 269
+ ltu-figshare-repo.s3.aarnet.edu.au | success | 268
+ mausamjournal.imd.gov.in | | 202
+ mdpi-res.com | | 8892
+ mdpi-res.com | success | 8863
+ mededpublish.org | | 1900
+ mededpublish.org | no-pdf-link | 1900
+ meetingorganizer.copernicus.org | | 276
+ meetingorganizer.copernicus.org | no-pdf-link | 271
+ muse.jhu.edu | | 1047
+ muse.jhu.edu | terminal-bad-status | 755
+ muse.jhu.edu | link-loop | 203
+ online.ucpress.edu | | 358
+ online.ucpress.edu | link-loop | 212
+ onlinelibrary.wiley.com | | 5813
+ onlinelibrary.wiley.com | terminal-bad-status | 4587
+ onlinelibrary.wiley.com | spn2-wayback-error | 614
+ onlinelibrary.wiley.com | blocked-cookie | 381
+ open.library.ubc.ca | | 206
+ opendata.uni-halle.de | | 1768
+ opendata.uni-halle.de | success | 1215
+ opendata.uni-halle.de | wrong-mimetype | 260
+ opendata2.uni-halle.de | | 206
+ opg.optica.org | | 205
+ osf.io | | 2949
+ osf.io | no-pdf-link | 2404
+ osf.io | spn2-backoff | 299
+ papers.ssrn.com | | 3962
+ papers.ssrn.com | link-loop | 3800
+ peerj.com | | 273
+ preprints.jmir.org | | 275
+ preprints.jmir.org | cdx-error | 255
+ publikationen.bibliothek.kit.edu | | 213
+ publons.com | | 593
+ publons.com | no-pdf-link | 590
+ pubs.acs.org | | 2288
+ pubs.acs.org | terminal-bad-status | 1841
+ pubs.acs.org | spn2-wayback-error | 210
+ pubs.rsc.org | | 1698
+ pubs.rsc.org | bad-redirect | 811
+ pubs.rsc.org | link-loop | 352
+ pubs.rsc.org | success | 307
+ radiopaedia.org | | 220
+ read.dukeupress.edu | | 303
+ repositories.lib.utexas.edu | | 1570
+ repositories.lib.utexas.edu | bad-redirect | 513
+ repositories.lib.utexas.edu | spn2-backoff | 383
+ repositories.lib.utexas.edu | gateway-timeout | 379
+ repositories.lib.utexas.edu | terminal-bad-status | 282
+ repository.uj.ac.za | | 489
+ repository.uj.ac.za | no-pdf-link | 365
+ repository.unsworks.unsw.edu.au | | 397
+ repository.urosario.edu.co | | 2429
+ repository.urosario.edu.co | success | 1648
+ repository.urosario.edu.co | bad-redirect | 613
+ rex.libraries.wsu.edu | no-pdf-link | 241
+ rex.libraries.wsu.edu | | 241
+ rsdjournal.org | | 208
+ rsdjournal.org | success | 208
+ s3-ap-southeast-2.amazonaws.com | | 282
+ s3-ap-southeast-2.amazonaws.com | success | 277
+ s3-eu-west-1.amazonaws.com | | 4615
+ s3-eu-west-1.amazonaws.com | success | 4593
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 240
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 237
+ sage.figshare.com | | 415
+ sage.figshare.com | no-pdf-link | 385
+ scholar.dkyobobook.co.kr | | 512
+ scholar.dkyobobook.co.kr | no-pdf-link | 509
+ scholarlypublishingcollective.org | | 287
+ scholarworks.gsu.edu | | 1132
+ scholarworks.gsu.edu | success | 1000
+ scholarworks.iupui.edu | | 205
+ scholarworks.umass.edu | | 417
+ scholarworks.umass.edu | success | 400
+ sciencescholar.us | | 404
+ secure.jbs.elsevierhealth.com | | 727
+ secure.jbs.elsevierhealth.com | terminal-bad-status | 722
+ tandf.figshare.com | | 354
+ tandf.figshare.com | no-pdf-link | 342
+ unsworks.unsw.edu.au | | 408
+ unsworks.unsw.edu.au | spn2-cdx-lookup-failure | 342
+ valep.vc.univie.ac.at | no-pdf-link | 737
+ valep.vc.univie.ac.at | | 737
+ watermark.silverchair.com | | 1604
+ watermark.silverchair.com | success | 1598
+ wayf.switch.ch | | 215
+ wayf.switch.ch | no-pdf-link | 213
+ www.ahajournals.org | | 438
+ www.ahajournals.org | no-pdf-link | 306
+ www.ahbps.org | | 316
+ www.ahbps.org | success | 312
+ www.atenaeditora.com.br | | 390
+ www.atenaeditora.com.br | terminal-bad-status | 333
+ www.atlantis-press.com | | 914
+ www.atlantis-press.com | success | 901
+ www.atsjournals.org | | 1245
+ www.atsjournals.org | success | 1189
+ www.biorxiv.org | | 712
+ www.biorxiv.org | success | 670
+ www.bloomsburycollections.com | | 982
+ www.bloomsburycollections.com | terminal-bad-status | 566
+ www.cahiers-clsl.ch | | 305
+ www.cahiers-clsl.ch | success | 298
+ www.cairn.info | | 1799
+ www.cairn.info | no-pdf-link | 662
+ www.cairn.info | link-loop | 487
+ www.cairn.info | success | 355
+ www.cairn.info | terminal-bad-status | 267
+ www.cambridge.org | | 3258
+ www.cambridge.org | no-pdf-link | 1682
+ www.cambridge.org | success | 682
+ www.cambridge.org | bad-redirect | 404
+ www.cambridge.org | link-loop | 302
+ www.dbpia.co.kr | | 763
+ www.dbpia.co.kr | no-pdf-link | 443
+ www.dbpia.co.kr | redirect-loop | 287
+ www.degruyter.com | | 12655
+ www.degruyter.com | no-pdf-link | 9112
+ www.degruyter.com | success | 2898
+ www.degruyter.com | spn2-backoff | 507
+ www.dib.ie | | 1381
+ www.dib.ie | no-pdf-link | 1378
+ www.dovepress.com | | 231
+ www.dovepress.com | success | 216
+ www.e-manuscripta.ch | | 767
+ www.e-manuscripta.ch | success | 399
+ www.e-periodica.ch | | 1406
+ www.e-periodica.ch | no-pdf-link | 1402
+ www.e-rara.ch | no-pdf-link | 251
+ www.e-rara.ch | | 251
+ www.editoracientifica.org | no-pdf-link | 205
+ www.editoracientifica.org | | 205
+ www.elgaronline.com | | 427
+ www.elibrary.ru | | 616
+ www.elibrary.ru | terminal-bad-status | 364
+ www.elibrary.ru | no-pdf-link | 216
+ www.emerald.com | | 862
+ www.emerald.com | no-pdf-link | 724
+ www.endocrine-abstracts.org | | 1907
+ www.endocrine-abstracts.org | no-pdf-link | 1905
+ www.eurekaselect.com | | 285
+ www.eurekaselect.com | link-loop | 246
+ www.even3.com.br | | 233
+ www.frontiersin.org | | 585
+ www.frontiersin.org | spn2-backoff | 436
+ www.humankineticslibrary.com | no-pdf-link | 207
+ www.humankineticslibrary.com | | 207
+ www.igi-global.com | | 1600
+ www.igi-global.com | no-pdf-link | 1199
+ www.igi-global.com | bad-redirect | 258
+ www.inderscience.com | | 385
+ www.inderscience.com | no-pdf-link | 365
+ www.inderscienceonline.com | | 202
+ www.ingentaconnect.com | | 450
+ www.ingentaconnect.com | no-pdf-link | 260
+ www.jstage.jst.go.jp | | 1248
+ www.jstage.jst.go.jp | success | 870
+ www.karger.com | | 313
+ www.liebertpub.com | | 271
+ www.liebertpub.com | no-pdf-link | 241
+ www.nicecjournal.co.uk | | 274
+ www.nicecjournal.co.uk | success | 274
+ www.nomos-elibrary.de | | 1771
+ www.nomos-elibrary.de | no-pdf-link | 788
+ www.nomos-elibrary.de | redirect-loop | 506
+ www.nomos-elibrary.de | bad-redirect | 207
+ www.osti.gov | | 381
+ www.osti.gov | link-loop | 326
+ www.persee.fr | | 277
+ www.preprints.org | | 225
+ www.preprints.org | success | 225
+ www.protocols.io | | 770
+ www.protocols.io | success | 485
+ www.repository.cam.ac.uk | | 510
+ www.repository.cam.ac.uk | bad-redirect | 213
+ www.research-collection.ethz.ch | | 416
+ www.research-collection.ethz.ch | bad-redirect | 249
+ www.researchsquare.com | | 1121
+ www.researchsquare.com | bad-redirect | 985
+ www.scielo.br | | 828
+ www.scielo.br | success | 641
+ www.sciencedirect.com | | 8567
+ www.sciencedirect.com | terminal-bad-status | 5773
+ www.sciencedirect.com | spn2-wayback-error | 1590
+ www.sciencedirect.com | no-pdf-link | 576
+ www.sciencedirect.com | spn2-backoff | 479
+ www.sciendo.com | | 257
+ www.sciendo.com | success | 222
+ www.scitepress.org | | 381
+ www.scitepress.org | no-pdf-link | 377
+ www.spiedigitallibrary.org | | 1061
+ www.spiedigitallibrary.org | bad-redirect | 571
+ www.spiedigitallibrary.org | gateway-timeout | 233
+ www.tandfonline.com | | 4934
+ www.tandfonline.com | no-pdf-link | 2088
+ www.tandfonline.com | terminal-bad-status | 1282
+ www.tandfonline.com | blocked-cookie | 757
+ www.tandfonline.com | redirect-loop | 488
+ www.tandfonline.com | spn2-wayback-error | 202
+ www.taylorfrancis.com | | 3979
+ www.taylorfrancis.com | link-loop | 1928
+ www.taylorfrancis.com | no-pdf-link | 1840
+ www.techniques-ingenieur.fr | | 354
+ www.techniques-ingenieur.fr | no-pdf-link | 353
+ www.thieme-connect.de | | 1987
+ www.thieme-connect.de | no-pdf-link | 949
+ www.thieme-connect.de | link-loop | 869
+ www.tib.eu | no-pdf-link | 315
+ www.tib.eu | | 315
+ www.un-ilibrary.org | no-pdf-link | 352
+ www.un-ilibrary.org | | 352
+ www.worldscientific.com | | 668
+ www.worldscientific.com | no-pdf-link | 629
+ www.zora.uzh.ch | | 318
+ zenodo.org | | 46585
+ zenodo.org | no-pdf-link | 29519
+ zenodo.org | success | 14768
+ zenodo.org | terminal-bad-status | 810
+ zenodo.org | wrong-mimetype | 691
+ zenodo.org | spn2-cdx-lookup-failure | 395
+ zenodo.org | spn2-backoff | 294
+ zivahub.uct.ac.za | | 1909
+ zivahub.uct.ac.za | no-pdf-link | 1880
+ zop.zb.uzh.ch | | 228
+ zop.zb.uzh.ch | success | 217
+ | | 365582
+ | success | 141497 38.7%
+ | no-pdf-link | 120852 33.0%
+ | terminal-bad-status | 31900 8.7%
+ | spn2-backoff | 16979 4.6%
+ | link-loop | 13624 3.7%
+ | bad-redirect | 8736
+ | redirect-loop | 7405
+ | gateway-timeout | 6997
+ | spn2-cdx-lookup-failure | 5146
+ | spn2-wayback-error | 3708
+ | wrong-mimetype | 2158
+ | blocked-cookie | 1942
+ | spn2-error:blocked-url | 1733
+ | wayback-error | 1063
+ | spn2-error | 647
+ | spn2-error:500 | 265
+ | cdx-error | 257
+(383 rows)
+
+----
+
+365k in 7 days is about 52k a day, which is roughly what we expect. Around
+5-7% need retries.
+
+important changes:
+- biorxiv.org: needs fix and then retries
+- academic.oup.com: should probably skip
+- apps.crossref.org: need to handle this in code
+- arxiv.org: should retry `terminal-bad-status` on PDFs (a sizing query is sketched at the end of these notes); should also add support for extracting the PDF link from `/abs/` pages
+- doi.org: investigate redirect-loop and terminal-bad-status
+- osf.io: not getting PDFs
+- papers.ssrn.com: why are these attempted?
+- publons.com: not getting PDFs; special case these?
+- www.sciencedirect.com: not working at all?
+
+smaller:
+- bridges.monash.edu: fix, then retry?
+- dl.acm.org: some broader retries?
+- figshare.com: still some attempts, but almost all no-pdf-link
+- onlinelibrary.wiley.com: getting blocked broadly?
+- www.endocrine-abstracts.org: HTML content?
+- www.igi-global.com: no-pdf-link
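+
+For the arxiv.org retry item above, a first step could be sizing up which
+terminal status codes are involved (untested sketch, over all time rather
+than just this crawl window):
+
+    SELECT ingest_file_result.terminal_status_code, COUNT(*)
+    FROM ingest_file_result
+    JOIN ingest_request
+        ON ingest_request.base_url = ingest_file_result.base_url
+        AND ingest_request.ingest_type = ingest_file_result.ingest_type
+    WHERE ingest_request.link_source = 'arxiv'
+        AND ingest_file_result.status = 'terminal-bad-status'
+    GROUP BY ingest_file_result.terminal_status_code
+    ORDER BY COUNT DESC;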
diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt
new file mode 100644
index 0000000..be2b30c
--- /dev/null
+++ b/sql/stats/2022-09-06_stats.txt
@@ -0,0 +1,438 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 459 GB | 10 GB | 470 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 62 GB | 44 GB | 106 GB
+ "public"."ingest_request" | 51 GB | 50 GB | 101 GB
+ "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB
+ "public"."file_meta" | 39 GB | 39 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB
+ "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 7469 MB | 66 MB | 7535 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 198175106 | 282695671015403
+ (1 row)
+
+ 198 million files, 282 TBytes.
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 197021437
+ text/html | 830331
+ application/octet-stream | 186669
+ application/xml | 42170
+ application/xhtml+xml | 38207
+ text/plain | 16471
+ application/jats+xml | 10385
+ application/gzip | 6681
+ | 6032
+ application/postscript | 4916
+ image/jpeg | 4522
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 946
+ application/x-bzip2 | 891
+ image/png | 659
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440
+ application/x-dosexec | 404
+ image/gif | 395
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382
+ application/x-compress | 274
+ video/mp4 | 218
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/zlib | 68
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ image/g3fax | 35
+ text/rtf | 33
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 12800
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 137283420 | 172140506
+ (1 row)
+
+Mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 157465613
+ warc/revisit | 11337336
+ text/html | 1137208
+ application/octet-stream | 950380
+ text/xml | 528965
+ unk | 253294
+ application/postscript | 81130
+ application/save | 81069
+ binary/octet-stream | 68942
+ application/x-download | 42717
+ application/download | 40628
+ image/pdf | 39904
+ text/plain | 36445
+ application/force-download | 24148
+ multipart/form-data | 10972
+ application | 5409
+ application/x-octetstream | 5192
+ application/x-msdownload | 3854
+ .pdf | 3518
+ application/x-pdf | 3061
+ application/octet | 1792
+ pdf | 1757
+ application/binary | 1399
+ file | 1373
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1087
+ application/unknown | 1005
+ 0 | 773
+ text/pdf | 729
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 129001717
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 120797098
+ 500 | 8198783
+ -4 | 5802
+ 503 | 36
+ (4 rows)
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.7.0-131-gdd0251d9f | 60469462
+ 0.5.5-fatcat | 47472904
+ | 12665498
+ 0.7.0-104-gbeebd9a6b | 189243
+ (4 rows)
+
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | unpaywall | 43932525
+ pdf | doi | 43852308
+ pdf | mag | 43701948
+ pdf | doaj | 6534341
+ html | doaj | 3987669
+ pdf | arxiv | 2784589
+ pdf | pmc | 2439181
+ pdf | dblp | 631716
+ html | doi | 126699
+ xml | doaj | 23066
+ pdf | cnki_covid19 | 2034
+ pdf | spn | 1026
+ pdf | wanfang_covid19 | 975
+ html | spn | 65
+ xml | spn | 2
+ xml | doi | 1
+ (17 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | unpaywall | unpaywall | 43932525
+ pdf | mag | mag-corpus | 43701948
+ pdf | doi | fatcat-changelog | 24742500
+ pdf | doi | fatcat-ingest | 15592121
+ pdf | doaj | doaj | 6484737
+ html | doaj | doaj | 3987468
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1984766
+ pdf | arxiv | fatcat-changelog | 799793
+ pdf | dblp | dblp | 631716
+ pdf | pmc | fatcat-ingest | 297980
+ html | doi | fatcat-ingest | 121508
+ pdf | pmc | fatcat-changelog | 112376
+ pdf | doaj | fatcat-changelog | 47181
+ xml | doaj | doaj | 23066
+ html | doi | fatcat-changelog | 5129
+ pdf | doaj | fatcat-ingest | 2423
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | doi | savepapernow-web | 1814
+ pdf | spn | savepapernow-web | 1026
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ html | doaj | fatcat-ingest | 201
+ html | spn | savepapernow-web | 65
+ html | doi | savepapernow-web | 62
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 4
+ xml | spn | savepapernow-web | 2
+ xml | doi | savepapernow-web | 1
+ (30 rows)
+
+Uncrawled requests by source:
+
+    -- TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 167653
+ pdf | doaj | 81517
+ pdf | oai | 15282
+ html | doaj | 1791
+ pdf | unpaywall | 270
+ pdf | doi | 22
+ (6 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 16024068 | 0.313
+ pdf | unpaywall | 43932525 | 36045446 | 0.820
+ pdf | doi | 43852308 | 14956080 | 0.341
+ pdf | mag | 43701948 | 32768484 | 0.750
+ pdf | doaj | 6534341 | 4704066 | 0.720
+ html | doaj | 3987669 | 778165 | 0.195
+ pdf | arxiv | 2784589 | 2419941 | 0.869
+ pdf | pmc | 2439181 | 1897671 | 0.778
+ pdf | dblp | 631716 | 305142 | 0.483
+ html | doi | 126699 | 75754 | 0.598
+ xml | doaj | 23066 | 10381 | 0.450
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | spn | 1026 | 778 | 0.758
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ html | spn | 65 | 13 | 0.200
+ xml | spn | 2 | 1 | 0.500
+ xml | doi | 1 | 0 | 0.000
+ (17 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-------------------------------+----------
+ pdf | success | 94887295
+ pdf | no-pdf-link | 33960080
+ pdf | no-capture | 20893916
+ pdf | terminal-bad-status | 6973765
+ pdf | redirect-loop | 5775175
+ pdf | link-loop | 4095424
+ pdf | skip-url-blocklist | 4037518
+ pdf | blocked-cookie | 3508762
+ html | wrong-scope | 1783694
+ pdf | wrong-mimetype | 1379673
+ html | success | 853762
+ pdf | gateway-timeout | 635170
+ html | no-capture | 381283
+ pdf | wayback-content-error | 356694
+ pdf | cdx-error | 347700
+ pdf | null-body | 336166
+ html | unknown-scope | 321874
+ html | html-resource-no-capture | 294294
+ pdf | forbidden | 291127
+ pdf | not-found | 274343
+ pdf | too-many-redirects | 264494
+ component | wrong-mimetype | 196680
+ component | spn2-cdx-lookup-failure | 173615
+ component | spn2-backoff | 115840
+ html | terminal-bad-status | 106264
+ html | null-body | 100296
+ pdf | wayback-error | 94748
+ html | blocked-cookie | 88537
+ component | no-capture | 75278
+ pdf | empty-blob | 61157
+ pdf | bad-redirect | 58680
+ pdf | skip-wall | 57751
+ pdf | spn2-error:too-many-redirects | 52873
+ html | spn2-backoff | 50577
+ pdf | remote-server-error | 41282
+ pdf | invalid-host-resolution | 38864
+ pdf | read-timeout | 37071
+ pdf | spn2-cdx-lookup-failure | 34229
+ html | wrong-mimetype | 33643
+ pdf | spn2-backoff | 32437
+ pdf | petabox-error | 31006
+ html | wayback-content-error | 28034
+ component | spn2-error | 27044
+ pdf | spn2-error:unknown | 25810
+ component | gateway-timeout | 25215
+ pdf | body-too-large | 21721
+ html | petabox-error | 18313
+ html | empty-blob | 14393
+ html | redirect-loop | 13404
+ component | blocked-cookie | 12287
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 45052391
+ pdf | | 26117481
+ pdf | 301 | 4814786
+ html | 200 | 2684821
+ pdf | 403 | 1871088
+ pdf | 404 | 1254259
+ pdf | 302 | 898728
+ pdf | 503 | 867548
+ pdf | 401 | 851205
+ pdf | 429 | 741869
+ pdf | 400 | 624519
+ component | | 456915
+ html | | 442051
+ pdf | 500 | 283700
+ component | 200 | 197510
+ pdf | 410 | 120647
+ pdf | 303 | 107947
+ html | 404 | 80114
+ pdf | 420 | 26722
+ pdf | 502 | 19500
+ pdf | 409 | 15499
+ html | 429 | 15208
+ pdf | 509 | 15167
+ pdf | 999 | 12186
+ pdf | 202 | 11535
+ html | 301 | 10213
+ xml | | 10018
+ pdf | 307 | 8657
+ pdf | 402 | 8338
+ pdf | 412 | 8064
+ pdf | 308 | 6479
+ html | 500 | 4746
+ xml | 200 | 2668
+ pdf | 520 | 2496
+ html | 302 | 2289
+ pdf | 521 | 2257
+ html | 202 | 2177
+ pdf | 206 | 1961
+ html | 403 | 1775
+ pdf | 504 | 1187
+ pdf | 421 | 1148
+ html | 303 | 1112
+ pdf | 406 | 1109
+ pdf | 204 | 772
+ pdf | 432 | 745
+ pdf | 405 | 633
+ html | 400 | 632
+ pdf | 426 | 515
+ pdf | 508 | 503
+ pdf | 505 | 469
+ (50 rows)
diff --git a/sql/stats/2022-11-23_table_sizes.txt b/sql/stats/2022-11-23_table_sizes.txt
new file mode 100644
index 0000000..0a6254a
--- /dev/null
+++ b/sql/stats/2022-11-23_table_sizes.txt
@@ -0,0 +1,21 @@
+PostgreSQL 13.2 - wbgrp-svc506.us.archive.org
+Size: 1.13T
+
+ table_name | table_size | indexes_size | total_size
+------------------------------------+------------+--------------+------------
+ "public"."crossref" | 459 GB | 10 GB | 470 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 63 GB | 45 GB | 108 GB
+ "public"."ingest_request" | 53 GB | 52 GB | 105 GB
+ "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB
+ "public"."file_meta" | 39 GB | 40 GB | 79 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB
+ "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 7879 MB | 68 MB | 7947 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
diff --git a/sql/stats/README.md b/sql/stats/README.md
new file mode 100644
index 0000000..3161514
--- /dev/null
+++ b/sql/stats/README.md
@@ -0,0 +1,109 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
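+Total database size (presumably how the `Size:` line in the dated table-size
+snapshot files is generated):
+
+    SELECT pg_size_pretty(pg_database_size(current_database()));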
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+Mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+What version was used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+Uncrawled requests by source:
+
+    -- TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
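+One way to cross-check the anti-join above (a sketch, not verified against the
+dated stats files) is the equivalent `NOT EXISTS` form:
+
+    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+    FROM ingest_request
+    WHERE NOT EXISTS (
+        SELECT 1 FROM ingest_file_result
+        WHERE ingest_file_result.base_url = ingest_request.base_url
+        AND ingest_file_result.ingest_type = ingest_request.ingest_type
+    )
+    GROUP BY ingest_request.ingest_type, ingest_request.link_source
+    ORDER BY COUNT DESC LIMIT 35;
+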
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
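+## Domain Status Breakdowns
+
+The per-domain status tables in the crawl-specific notes are generated with a
+query roughly like the following (a sketch; it assumes the `terminal_url`
+column on `ingest_file_result`, and is usually filtered to a specific
+`ingest_request_source` and date range):
+
+    SELECT domain, status, COUNT(*)
+    FROM (
+        SELECT
+            ingest_file_result.status,
+            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)/') AS domain
+        FROM ingest_file_result
+        LEFT JOIN ingest_request
+            ON ingest_request.base_url = ingest_file_result.base_url
+            AND ingest_request.ingest_type = ingest_file_result.ingest_type
+    ) t1
+    WHERE t1.domain != ''
+    GROUP BY domain, status
+    ORDER BY COUNT DESC
+    LIMIT 60;
+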
diff --git a/sql/table_sizes.md b/sql/table_sizes.md
new file mode 100644
index 0000000..3596b2b
--- /dev/null
+++ b/sql/table_sizes.md
@@ -0,0 +1,11 @@
+
+## September 2019
+
+ table_name | table_size | indexes_size | total_size
+ --------------------------------------------------------------+------------+--------------+------------
+ "public"."cdx" | 31 GB | 27 GB | 58 GB
+ "public"."file_meta" | 13 GB | 6500 MB | 19 GB
+ "public"."shadow" | 8303 MB | 9216 MB | 17 GB
+ "public"."grobid" | 4994 MB | 6678 MB | 11 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB